diff --git a/.gitignore b/.gitignore index dc0a38edcb563589ce3845803174598ca68ec396..be97cf2f3ff9878774913ecf8dab0130179bbf16 100644 --- a/.gitignore +++ b/.gitignore @@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources +#flatbuffers +lite/model_parser/flatbuffers/framework_generated.h + build* + +# hiai libs +ai_ddk_lib* diff --git a/.gitmodules b/.gitmodules index 107036c70292cf33e945f45a8bac935dea554ece..37af6a724560144190539ab677c8f17524f5e645 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/protobuf-host"] path = third-party/protobuf-host url = https://github.com/protocolbuffers/protobuf.git +[submodule "third-party/flatbuffers"] + path = third-party/flatbuffers + url = https://github.com/google/flatbuffers.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac227f0154feb64178d9a99b6784bfd6db40d50..e598f1dcd501b2ca09273a0914ff4cdf66f8b0e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) +lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) @@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_WITH_LOG "Enable log printing or not." ON) +lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF) lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) @@ -106,7 +108,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) -lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF) +lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF) lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. 
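For reference, the options added above are ordinary `lite_option` switches and are toggled at configure time like the existing ones. A minimal sketch of configure invocations exercising them follows; the install path and the option combinations are illustrative assumptions, not taken from this patch:

```shell
# Sketch only: turn on the switches introduced by this patch at configure time.
# HUAWEI_ASCEND_NPU_DDK_ROOT must point at an installed Ascend DDK
# (consumed by cmake/device/huawei_ascend_npu.cmake below); the path is an example.
export HUAWEI_ASCEND_NPU_DDK_ROOT=/usr/local/Ascend
cmake .. -DWITH_LITE=ON -DLITE_WITH_X86=ON -DLITE_WITH_HUAWEI_ASCEND_NPU=ON

# Exception support and CUDA FP16 kernels are independent switches:
cmake .. -DWITH_LITE=ON -DLITE_WITH_EXCEPTION=ON
cmake .. -DWITH_LITE=ON -DLITE_WITH_CUDA=ON -DCUDA_WITH_FP16=ON
```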
@@ -168,6 +171,7 @@ if(LITE_WITH_RKNPU) include(device/rknpu) endif() +include(external/flatbuffers) # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -222,6 +226,11 @@ endif() if(LITE_WITH_MLU) include(mlu) endif() + +if(LITE_WITH_HUAWEI_ASCEND_NPU) + include(device/huawei_ascend_npu) +endif() + include(coveralls) include(external/mklml) # download mklml package diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 1b0890e0dbf5e741176c293a059d809752c72a43..773de573aff92599ad6e5fb746a2956d9e50a8c2 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,6 +174,10 @@ if (LITE_WITH_MLU) add_definitions("-DLITE_WITH_MLU") endif() +if (LITE_WITH_HUAWEI_ASCEND_NPU) +add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") endif() @@ -190,6 +194,10 @@ if (LITE_WITH_LOG) add_definitions("-DLITE_WITH_LOG") endif() +if (LITE_WITH_EXCEPTION) + add_definitions("-DLITE_WITH_EXCEPTION") +endif() + if (LITE_ON_TINY_PUBLISH) add_definitions("-DLITE_ON_TINY_PUBLISH") endif() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 4fc59ccd62671c5862a298832b1ec03d4e96d05a..68f91fe88173f1cd254bc44d5e7dbcd456bfcdb8 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -35,7 +35,11 @@ endif() if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "23") if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_API_LEVEL "22") + if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH) + set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24 + else() + set(ANDROID_API_LEVEL "22") + endif() endif() endif() @@ -76,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang") elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") set(triple arm-v7a-linux-android) set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE) + if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + # Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ is linked in the right order + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a") + if (ANDROID_API_LEVEL LESS 21) + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a") + endif() + if(ANDROID_STL_TYPE STREQUAL "c++_shared") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so") + elseif(ANDROID_STL_TYPE STREQUAL "c++_static") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a") + else() + message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.") + endif() + endif() else() message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") endif() diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 069923c779fbd3eed4f5f81ef3e386ff70fac215..c9c3fc9f2681b6002567d555a26ee14edefaeae5 100644 --- 
a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -23,6 +23,21 @@ if(ANDROID) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC") + + # Don't re-export libgcc symbols + set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + + # Only the libunwind.a from clang(with libc++) provide C++ exception handling support for 32-bit ARM + # Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding + if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + endif() endif() if(ARMLINUX) @@ -59,14 +74,13 @@ function(check_linker_flag) endfunction() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables") +endif() if (LITE_ON_TINY_PUBLISH) - if((NOT LITE_WITH_PYTHON)) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") - endif() - if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e7df3f0fd6f0b0efcaf9cd859df5fb84a0cadfc4..eb8e26218ad1d8adc920b1834abd9ba10669a3e9 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA) return() endif() +if(WITH_CUDA_FP16) + add_definitions("-DCUDA_WITH_FP16") +endif() + set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62") @@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() +if (CUDA_WITH_FP16) + STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs}) +endif() + include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) if(WIN32) diff --git a/cmake/device/huawei_ascend_npu.cmake b/cmake/device/huawei_ascend_npu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0bd9591eee702f4db914a8b547c4c99b21d0473b --- /dev/null +++ b/cmake/device/huawei_ascend_npu.cmake @@ -0,0 +1,169 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +# 1. path to the Huawei Ascend install directory +if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT) + set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT}) + if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT) + message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON") + endif() +endif() +message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}") + +# 2. Huawei Ascend include directory +set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include") +set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include") +set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp") +include_directories(${ACL_INCLUDE_DIR}) +include_directories(${ATC_INCLUDE_DIR}) +include_directories(${OPP_INCLUDE_DIR}) + +# 3. find ACL libs (ACL libs should be linked before ATC libs) +find_library(ACL_ASCENDCL_FILE NAMES ascendcl + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_ASCENDCL_FILE) + message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}") + add_library(acl_ascendcl SHARED IMPORTED GLOBAL) + set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE}) +endif() + +# 3.1 ascendcl dependency - libruntime.so +find_library(ACL_RUNTIME_FILE NAMES runtime + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_RUNTIME_FILE) + message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}") + add_library(acl_runtime SHARED IMPORTED GLOBAL) + set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE}) +endif() + +# 4.1 find ATC libs - libregister.so +find_library(ATC_REGISTER_FILE NAMES register + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_REGISTER_FILE) + message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}") + add_library(atc_register SHARED IMPORTED GLOBAL) + set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE}) +endif() + +# 4.1.1 dependency of register - libprotobuf.so.19 +find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19 + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_PROTOBUF_FILE) + message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}") + add_library(atc_protobuf SHARED IMPORTED GLOBAL) + set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE}) +endif() + +# 4.1.2 dependency of register - libgraph.so +find_library(ATC_GRAPH_FILE NAMES
graph + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GRAPH_FILE) + message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}") + add_library(atc_graph SHARED IMPORTED GLOBAL) + set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE}) +endif() + +# 4.2 find ATC libs - libge_compiler.so +find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMPILER_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}") + add_library(atc_ge_compiler SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE}) +endif() + +# 4.2.1 dependencies of libge_compiler.so - libge_common.so +find_library(ATC_GE_COMMON_FILE NAMES ge_common + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMMON_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}") + add_library(atc_ge_common SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE}) +endif() + +# 4.2.3 dependencies of libge_compiler.so - libresource.so +find_library(ATC_RESOURCE_FILE NAMES resource + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_RESOURCE_FILE) + message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}") + add_library(atc_resource SHARED IMPORTED GLOBAL) + set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE}) +endif() + +# 4.3 find OPP libs - libopsproto.so +find_library(OPP_OPS_PROTO_FILE NAMES opsproto + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in + NO_DEFAULT_PATH) + +if(NOT OPP_OPS_PROTO_FILE) + message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in") +else() + message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}") + add_library(opp_ops_proto SHARED IMPORTED GLOBAL) + set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE}) +endif() + +# 4.3.1 dependency of opp_ops_proto - liberror_manager.so +find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_ERROR_MANAGER_FILE) + message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}") + add_library(atc_error_manager SHARED IMPORTED GLOBAL) + set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE}) +endif() + +# note: huawei_ascend_npu_runtime_libs should before huawei_ascend_npu_builder_libs +set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs") +set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager + atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs") \ No newline at end 
of file diff --git a/cmake/device/npu.cmake b/cmake/device/npu.cmake index 88598f4690a157b20ac1873d84ad13c2f8652725..0409b6a60fc651cbaade61998a09bc0489bc978c 100644 --- a/cmake/device/npu.cmake +++ b/cmake/device/npu.cmake @@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} NO_DEFAULT_PATH) +# Added in HiAI DDK 320 or later version +find_library(NPU_DDK_HCL_FILE NAMES hcl + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) + if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") else() @@ -78,5 +83,13 @@ else() set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) endif() -set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") +if(NOT NPU_DDK_HCL_FILE) +# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}") +else() + message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}") + add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL) + set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE}) +endif() + +set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 823048552f3cb5f05375e97e94cd5b5ad63e7563..16fc7dcf4191a6b2a145d4d6e70e915fe5321a6b 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -39,7 +39,7 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib + PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e6ab31ee855f5bbc0594f37c00a3ec46d8e4231d --- /dev/null +++ b/cmake/external/flatbuffers.cmake @@ -0,0 +1,114 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers) +SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers) +SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers) +SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." 
FORCE) +IF(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ELSE(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR}) + +if(NOT HOST_CXX_COMPILER) + set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(HOST_C_COMPILER ${CMAKE_C_COMPILER}) +endif() + +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}") + +ExternalProject_Add( + extern_flatbuffers + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/google/flatbuffers.git" + GIT_TAG "v1.12.0" + SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR} + PREFIX ${FLATBUFFERS_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DFLATBUFFERS_BUILD_TESTS=OFF + ${CROSS_COMPILE_CMAKE_ARGS} + ${OPTIONAL_ARGS} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib") + add_custom_command(TARGET extern_flatbuffers POST_BUILD + COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib + ) + ENDIF() +ENDIF(WIN32) +ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES}) +ADD_DEPENDENCIES(flatbuffers extern_flatbuffers) + +SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc) + +function(register_generated_output file_name) + get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS) + list(APPEND tmp ${file_name}) + set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp}) +endfunction(register_generated_output) + +function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT) + if(FLATBUFFERS_BUILD_LEGACY) + set(OPT ${OPT};--cpp-std c++0x) + else() + # --cpp-std is defined by flatc default settings. 
+ endif() + message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'") + get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH) + message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}") + string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}" + COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}" + --cpp --gen-mutable --gen-object-api --reflect-names + ${OPT} + -o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}" + DEPENDS flatbuffers + COMMENT "Run generation: '${GEN_HEADER}'") + register_generated_output(${GEN_HEADER}) + add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER}) +endfunction() + +set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers") +set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs") +compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty") +include_directories(${FLATBUFFERS_INCLUDE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}) + diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 8408a79fa4265b08771e435dcc5e82801a9d40f9..fe66d0f643e9bdf0cb778c4e4647294f553c023e 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -118,6 +118,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -165,6 +171,7 @@ function(lite_cc_library TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -193,7 +200,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN}) @@ -215,6 +222,7 @@ function(lite_cc_binary TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) if(NOT WIN32) @@ -246,7 +254,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -276,6 +284,7 @@ function(lite_cc_test TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") +set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") @@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) +# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -438,6 +448,15 @@ function(add_kernel TARGET device level) endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU") + if (NOT LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) foreach(src ${args_SRCS}) @@ -481,6 +500,7 @@ function(add_kernel TARGET device level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -499,7 +519,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS 
MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -537,6 +557,7 @@ function(add_operator TARGET level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/demo/cxx/train_demo/README.md b/docs/demo_guides/cpp_train_demo.md similarity index 82% rename from lite/demo/cxx/train_demo/README.md rename to docs/demo_guides/cpp_train_demo.md index 56f4513d45676a1deb51bfb93096db156ddd0449..c10f2091f9c14f6fc81563248c75e72abd713666 100644 --- a/lite/demo/cxx/train_demo/README.md +++ b/docs/demo_guides/cpp_train_demo.md @@ -1,8 +1,10 @@ +# C++ Train Demo -# Introduction - 我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 +## Introduction + +我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 - 你可以通过book库中的 +你可以通过book库中的 [文档](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) 和 [源码](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) @@ -10,18 +12,16 @@ 其使用线性回归(Linear Regression) 模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。 -注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。 - -# Requirements +## Requirements - 一部安卓手机,用于运行训练程序 -- 装了Paddle (version: 1.7.0) 的python +- 装了Paddle (version >= 1.7.0) 的python -# Quick start +## Quick start -## Step1 build paddle-lite +### Step1 build paddle-lite -请按照[paddle-lite官方文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) 的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: +请按照paddle-lite官方文档的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: ```shell ## 配置环境 @@ -51,7 +51,7 @@ cd Paddle-Lite Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so ``` -## Step2 编译lr_trainer +### Step2 编译lr_trainer ```shell cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ @@ -64,7 +64,7 @@ bin/ `-- demo_trainer ``` -## Step3 download model and run it! +### Step3 download model and run it! 
在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行: @@ -102,7 +102,7 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -# 更多细节 +## 更多细节 上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令: ```shell @@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d 如果你想生成自己的模型用于训练,可以参考`train.py`中保存模型的方式。 -# 与Paddle训练结果做校对 +## 与Paddle训练结果做校对 -## 前10个Loss值 +### 前10个Loss值 为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。 @@ -171,11 +171,11 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -## Loss 曲线 +### Loss 曲线 控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。 -![lr_loss](image/lr_loss.png) +![lr_loss](../images/lr_loss.png) 如果想复现上述效果,paddle+python的运行命令为: diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md index 31a0e411566297d5556e6b7fffcec1343cd83781..52ea158cf7b9c827c17225b6690b1bd9d8d15d24 100644 --- a/docs/demo_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h rm ./lite/api/paddle_use_ops.h # 设置编译参数并开始编译 +# android-armv7:cpu+gpu+cv+extra ./lite/tools/build_android.sh \ --arch=armv7 \ --toolchain=clang \ - --with_cv=OFF \ --with_log=OFF \ - --with_extra=OFF \ + --with_extra=ON \ + --with_cv=ON \ --with_opencl=ON +# android-armv8:cpu+gpu+cv+extra +./lite/tools/build_android.sh \ + --arch=armv8 \ + --toolchain=clang \ + --with_log=OFF \ + --with_extra=ON \ + --with_cv=ON \ + --with_opencl=ON + + # 注:编译帮助请执行: ./lite/tools/build_android.sh help ``` @@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \ ## 3. 如何在Code中使用 -即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); +即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc),其中也包括判断当前设备是否支持OpenCL的方法; 注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 diff --git a/docs/demo_guides/python_demo.md b/docs/demo_guides/python_demo.md index d6a7b15bd9be638ef586e6b589e35eecbf1613c2..59f81783c0b2e791f9623e84cf57c269cbb7d6f2 100644 --- a/docs/demo_guides/python_demo.md +++ b/docs/demo_guides/python_demo.md @@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb) predictor = create_paddle_predictor(config) ``` -(3) 设置输入数据 +(3) 从图片读入数据 + +```python +image = Image.open('./example.jpg') +resized_image = image.resize((224, 224), Image.BILINEAR) +image_data = np.array(resized_image).flatten().tolist() +``` + +(4) 设置输入数据 + ```python input_tensor = predictor.get_input(0) input_tensor.resize([1, 3, 224, 224]) -input_tensor.set_float_data([1.] * 3 * 224 * 224) +input_tensor.set_float_data(image_data) ``` -(4) 执行预测 +(5) 执行预测 ```python predictor.run() ``` -(5) 得到输出数据 +(6) 得到输出数据 ```python output_tensor = predictor.get_output(0) print(output_tensor.shape()) diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/docs/images/lr_loss.png similarity index 100% rename from lite/demo/cxx/train_demo/image/lr_loss.png rename to docs/images/lr_loss.png diff --git a/docs/index.rst b/docs/index.rst index c241f091ed2cae906879f98b769bc6b7ce830fe1..b2fba7daba51c68207af27e249559c18ab10235f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation! demo_guides/baidu_xpu demo_guides/rockchip_npu demo_guides/mediatek_apu - + +.. 
toctree:: + :maxdepth: 1 + :caption: 训练示例(预览) + :name: sec-train_demo_guides + + demo_guides/cpp_train_demo + .. toctree:: :maxdepth: 1 :caption: API文档 diff --git a/docs/user_guides/Compile/iOS.md b/docs/user_guides/Compile/iOS.md index 355cc11875ce8f8db891fb843d2f1624180b71ff..60375ad1085dfac090442f9c0dad86cf71b64c9e 100644 --- a/docs/user_guides/Compile/iOS.md +++ b/docs/user_guides/Compile/iOS.md @@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件 - 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): ```shell -./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir ``` ```shell --with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fed728cb0e06c9758a0497a9cbb93d7edf39bda7..4c80d638d224d294e247ad3f5300498dd536be62 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -21,11 +21,11 @@ pip install paddlelite - 方法二: 下载opt可执行文件 从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择当前预测库对应版本的`opt`转化工具 -本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载 +本文提供`release/v2.6.1`和`release/v2.2.0`版本的优化工具下载 |版本 | Linux | MacOS| |---|---|---| -| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) | |`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | - 方法三: 源码编译opt diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 338449bfcb92e4029763c4357eb6d1fd5b820272..ee156038a6ea144921258734c92e9a2ea757d6ec 100644 --- a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -49,4 +49,4 @@ $ ./opt \ ## 五. 
测试工具 -为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug) 和 [Profile工具](debug)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ff4d00dbb1051320f817c8220a11a77edde7fb05..10601e34f9815bfee88d8dba58988169839cc86d 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") +message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -45,14 +46,17 @@ if (WITH_TESTING) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz") endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz") endif() endif() @@ -242,7 +246,6 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} - COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full" diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 85744f5cac4b5b6dc6cb149a0375a69c98d55dd7..6ff381268a5796a52136214b64db39c057b5d59b 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG)) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +endif() if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") @@ -11,12 +11,13 @@ endif() set(light_lib_DEPS 
light_api paddle_api paddle_api_light) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc DEPS paddle_api paddle_api_light paddle_api_full) - add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) - target_link_libraries(paddle_full_api_shared framework_proto) + target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files}) + add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header) + target_link_libraries(paddle_full_api_shared framework_proto op_registry) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) @@ -39,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH NPU_DEPS ${npu_kernels} APU_DEPS ${apu_kernels} RKNPU_DEPS ${rknpu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if(WIN32) target_link_libraries(paddle_light_api_shared shlwapi.lib) endif() - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels}) if(APPLE) set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds") set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}") @@ -70,7 +72,7 @@ else() set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") endif() set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -93,6 +95,7 @@ if (WITH_TESTING) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} APU_DEPS ${apu_kernels}) endif() @@ -111,6 +114,10 @@ if(LITE_WITH_RKNPU) set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) endif() +if(LITE_WITH_HUAWEI_ASCEND_NPU) + set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps}) +endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") @@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") +message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -143,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + HUAWEI_ASCEND_NPU_DEPS 
${huawei_ascend_npu_kernels}) endif() # for light api @@ -167,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - MLU_DEPS ${mlu_kernels}) + MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -190,6 +200,7 @@ if(WITH_TESTING) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -321,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -361,6 +373,9 @@ endif() if (LITE_WITH_PYTHON) add_subdirectory(python) + # add library for opt_base + lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) + add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) endif() if (LITE_ON_TINY_PUBLISH) @@ -368,9 +383,6 @@ if (LITE_ON_TINY_PUBLISH) endif() -# add library for opt_base -lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) -add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") @@ -393,6 +405,7 @@ if(NOT WITH_COVERAGE) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -414,7 +427,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -429,7 +443,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -444,7 +459,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -458,7 +474,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS 
paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -469,8 +486,9 @@ if(NOT IOS) XPU_DEPS ${xpu_kernels} RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} CL_DEPS ${opencl_kernels} - BM_DEPS ${bm_kernels} + BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -486,7 +504,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index d46e9f7cdec1cf422340ff11165ee166c7520bab..2929e24117c616a99ff4e078fd77fe8827186cb1 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + add_dependencies(paddle_lite_jni framework_fbs_header) if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -31,7 +32,7 @@ else() endif() set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f0557226a8770201d0fe79c385ef7e2d0240e91c..52fc33830828ce1325a77b821f1cea4c329e933b 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -13,26 +13,31 @@ // limitations under the License. 
#include "lite/api/cxx_api.h" + #include #include #include #include #include #include + #include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { namespace lite { +std::vector GetAllOps() { + return OpLiteFactory::Global().GetAllOps(); +} + void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { if (!program_) { GenRuntimeProgram(); } - program_->SaveOpInfosToProgram(program_desc_.get()); - program_->UpdateVarsOfProgram(program_desc_.get()); + program_->SaveToProgram(program_desc_); switch (model_type) { case lite_api::LiteModelType::kProtobuf: SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true); @@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir, void Predictor::SaveOpKernelInfo(const std::string &model_dir) { std::set ops_info; std::set kernels_info; - const auto &instructions_ = program_->instructions(); - for (auto &node : instructions_) { - // parse op type infomation - auto op = node.op()->op_info(); - ops_info.insert(op->Type()); - // parse kernel type information - std::string kernel_type_str = - node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) + - "," + PrecisionRepr(node.kernel()->precision()) + "," + - DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias(); - kernels_info.insert(kernel_type_str); + auto block_size = program_->block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + const auto &insts = program_->instructions(block_idx); + for (auto &inst : insts) { + // parse op type infomation + auto op = inst.op()->op_info(); + ops_info.insert(op->Type()); + // parse kernel type information + std::string kernel_type_str = + inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) + + "," + PrecisionRepr(inst.kernel()->precision()) + "," + + DataLayoutRepr(inst.kernel()->layout()) + "," + + inst.kernel()->alias(); + kernels_info.insert(kernel_type_str); + } } // get souce_file name from op type and kernel type @@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() { std::vector feeds; std::vector fetchs; - const auto &insts = program_->instructions(); - for (size_t i = 0; i < program_->num_instructions(); i++) { - const auto &op = insts[i].op()->op_info(); + const auto &insts = program_->instructions(kRootBlockIdx); + for (auto &inst : insts) { + const auto &op = inst.op()->op_info(); if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { @@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config, } else { LOG(INFO) << "Load model from file."; } - Build(model_path, model_file, param_file, @@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path, Build(program_desc_, valid_places, passes); } -void Predictor::Build(const std::shared_ptr &desc, +void Predictor::Build(const std::shared_ptr &program_desc, const std::vector &valid_places, const std::vector &passes) { - program_desc_ = desc; + program_desc_ = program_desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; for (auto &valid_place : valid_places) { @@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr &desc, } } if (is_quantized_model) { -#ifdef LITE_WITH_ARM inner_places.insert(inner_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); -#endif } - Program program(*desc.get(), scope_, inner_places); + Program program(program_desc_, scope_, inner_places); valid_places_ = inner_places; core::KernelPickFactor factor; diff 
--git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 6d0b7830d37be7e441df9e0e71f87572edaf3911..ceb823d5811aed26792318e3c1bf718ad9c2d851 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ".tailored_kernels_source_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; +std::vector<std::string> GetAllOps(); + /* * Predictor for inference, input a model, it will optimize and execute it. */ @@ -47,18 +49,33 @@ class LITE_API Predictor { program_desc_ = std::make_shared<cpp::ProgramDesc>(); } - // Create a predictor with the weight variable scope set. + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. Create a predictor with the + // given weight variable scope. + /////////////////////////////////////////////////////////////////// explicit Predictor(const std::shared_ptr<Scope>& root_scope) : scope_(root_scope) {} - Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc, + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. This constructor can + // only be called in Predictor->Clone. It creates + // a predictor from an existing ProgramDesc, Scope and RuntimeProgram. + /////////////////////////////////////////////////////////////////// + Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc, const std::shared_ptr<Scope>& root, const std::vector<Place>& valid_places, const std::vector<std::string>& var_names = {}) - : program_desc_(desc), scope_(root) { - Program program(*desc.get(), scope_, valid_places, var_names); - optimizer_ = Optimizer(std::move(program), valid_places); - exec_scope_ = optimizer_.exec_scope(); + : program_desc_(program_desc), scope_(root) { + // step 1. Create a Program to construct the exec_scope and ops + Program program(program_desc_, scope_, valid_places, var_names); + exec_scope_ = program.exec_scope(); valid_places_ = valid_places; + + // step 2. Create the RuntimeProgram. + program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx)); + program_generated_ = true; } // Build from a model, with places set for hardware config. @@ -77,32 +94,62 @@ class LITE_API Predictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool memory_from_memory = false); - void Build(const std::shared_ptr<cpp::ProgramDesc>& desc, + void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc, const std::vector<Place>& valid_places, const std::vector<std::string>& passes = {}); - std::shared_ptr<Predictor> Clone() const { + ////////////////////////////////////////////////////////// + // Function: Clone + // Usage: Create a Predictor from an existing one; + // the cloned predictor will share persistable variables + // in scope_ with the original predictor. + ////////////////////////////////////////////////////////// + std::shared_ptr<Predictor> Clone() { + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor from current program_desc_ and + // runtime_program. auto predictor = std::make_shared<Predictor>(program_desc_, scope_, valid_places_); + // step 3.
Return the result return predictor; } - - std::shared_ptr Clone( - const std::vector& var_names) const { + ////////////////////////////////////////////////////////// + // Function: Clone(var_names) + // Usage: Create a Predictor from an existed one, + // the cloned predictor will share persistable variables + // but persistable variables of name var_names will not + // be shared. + ////////////////////////////////////////////////////////// + std::shared_ptr Clone(const std::vector& var_names) { CHECK(program_desc_) << "Both program and scope of current predicotr " "should be not be nullptr in Clone mode."; CHECK(scope_) << "Both program and scope of current predicotr should be " "not be nullptr in Clone mode."; + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor friom current program_desc_ and + // runtime_program. auto predictor = std::make_shared( program_desc_, scope_, valid_places_, var_names); - - for (auto i : var_names) { - predictor->exec_scope_->LocalVar(i); - auto* tensor = predictor->scope_->Var(i)->GetMutable(); + // step3. Copy some persistable variables into private scope. + for (auto var_name : var_names) { + predictor->exec_scope_->LocalVar(var_name); + auto* tensor = + predictor->scope_->Var(var_name)->GetMutable(); auto* sub_tensor = - predictor->exec_scope_->Var(i)->GetMutable(); + predictor->exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } + // step4. Return the result return predictor; } @@ -138,6 +185,7 @@ class LITE_API Predictor { // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; + Scope* scope() { return scope_.get(); } // This method is disabled in mobile, for unnecessary dependencies required. 
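Editor's note: both Clone overloads above keep the root scope shared between the original predictor and the clone, so weights are stored once; only the variables listed in var_names get a private copy that can later diverge. A toy sketch of that ownership model, using hypothetical ToyScope/ToyPredictor types rather than the real Scope/Predictor classes:

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct ToyScope {
  std::map<std::string, float> vars;  // stands in for persistable tensors (weights)
};

struct ToyPredictor {
  std::shared_ptr<ToyScope> root;     // shared with every clone
  std::map<std::string, float> priv;  // per-predictor private copies

  ToyPredictor Clone(const std::vector<std::string>& var_names) const {
    ToyPredictor cloned{root, {}};
    for (const auto& name : var_names) {
      cloned.priv[name] = root->vars.at(name);  // copied, so it can diverge later
    }
    return cloned;
  }
};

int main() {
  ToyPredictor p{std::make_shared<ToyScope>(), {}};
  p.root->vars["fc_w"] = 1.f;
  p.root->vars["conv_w"] = 2.f;
  ToyPredictor q = p.Clone({"fc_w"});
  std::cout << q.priv.size() << " private, " << q.root->vars.size() << " shared\n";  // 1 private, 2 shared
}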
void SaveModel( @@ -160,7 +208,7 @@ class LITE_API Predictor { std::shared_ptr program_desc_; std::shared_ptr scope_; Scope* exec_scope_; - std::unique_ptr program_; + std::shared_ptr program_; bool program_generated_{false}; std::vector input_names_; std::vector output_names_; diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 7b3b6bf043dae6008d8d6d9bc1acde97a2e3de38..726783349f0dcc049c4578df5c9e0ecbdb3dee4f 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { #endif #ifdef LITE_WITH_MLU Env::Init(); - lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), - config.mlu_core_number(), - config.mlu_use_first_conv(), - config.mlu_first_conv_mean(), - config.mlu_first_conv_std(), - config.mlu_input_layout()); + lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_input_layout(), + config.mlu_firstconv_param()); #endif // LITE_WITH_MLU auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); @@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { } mode_ = config.power_mode(); threads_ = config.threads(); +#ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration + Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( + config.subgraph_model_cache_dir()); +#endif #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) int num_threads = config.x86_math_library_num_threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 5f57ed40ddb762f2d80fce2327a01100bae741d9..fbcf171726d741ef0073f423bc4a600c9f9389d0 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -15,8 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT namespace paddle { namespace lite { @@ -24,17 +22,18 @@ namespace lite { void LightPredictor::Build(const std::string& lite_model_file, bool model_from_memory) { if (model_from_memory) { - LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromMemory( + lite_model_file, scope_.get(), program_desc_.get()); } else { - LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get()); } // For weight quantization of post training, load the int8/16 weights // for optimized model, and dequant it to fp32. 
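Editor's note: a hedged usage sketch of the configuration options consumed in CxxPaddleApiImpl::Init above. set_device_id() and the subgraph model cache directory are introduced in this patch; set_model_dir(), set_subgraph_model_cache_dir() and CreatePaddlePredictor() are assumed to keep their usual published Paddle-Lite signatures, and "paddle_api.h" is assumed to be the installed API header name.

#include "paddle_api.h"  // published paddle::lite_api header (assumed install name)

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  // New in this patch: choose the Huawei Ascend NPU card and a directory in
  // which compiled subgraph models are cached between runs.
  config.set_device_id(0);
  config.set_subgraph_model_cache_dir("/data/subgraph_cache");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
  return 0;
}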
DequantizeWeight(); - - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); + program_desc_.reset(); } void LightPredictor::Build(const std::string& model_dir, @@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir, switch (model_type) { #ifndef LITE_ON_TINY_PUBLISH case lite_api::LiteModelType::kProtobuf: - LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_); + LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get()); break; #endif case lite_api::LiteModelType::kNaiveBuffer: { if (model_from_memory) { LoadModelNaiveFromMemory( - model_buffer, param_buffer, scope_.get(), &cpp_program_desc_); + model_buffer, param_buffer, scope_.get(), program_desc_.get()); } else { - LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_); + LoadModelNaive(model_dir, scope_.get(), program_desc_.get()); } break; } @@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir, } DequantizeWeight(); - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); } @@ -111,15 +110,17 @@ std::vector LightPredictor::GetOutputNames() { } // append the names of inputs and outputs into input_names_ and output_names_ void LightPredictor::PrepareFeedFetch() { - auto current_block = cpp_program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; - for (size_t i = 0; i < current_block->OpsSize(); i++) { - auto op = current_block->GetOp(i); - if (op->Type() == "feed") { - feeds.push_back(op); - } else if (op->Type() == "fetch") { - fetchs.push_back(op); + std::vector feeds; + std::vector fetchs; + std::shared_ptr program_desc = program_desc_; + auto main_block = program_desc->GetBlock(kRootBlockIdx); + auto op_size = main_block->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto op_desc = main_block->GetOp(op_idx); + if (op_desc->Type() == "feed") { + feeds.push_back(op_desc); + } else if (op_desc->Type() == "fetch") { + fetchs.push_back(op_desc); } } input_names_.resize(feeds.size()); @@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() { } } -void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { - std::vector insts; - // 1. Create op first - Program program(prog, scope_, {}); - -// 2. Create Instructs -#ifdef LITE_WITH_OPENCL - using OpenCLContext = Context; - std::unique_ptr local_ctx(new KernelContext()); - local_ctx->As().InitOnce(); -#endif - - // Create the kernels of the target places, and filter out the specific - // kernel with the target alias. 
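Editor's note: the rewritten LightPredictor::PrepareFeedFetch above simply scans the ops of the root block and classifies the "feed" and "fetch" operators, which define the predictor's inputs and outputs. A standalone sketch of that scan with a toy OpDesc stand-in (the real cpp::OpDesc also carries the "col" attribute used to order the names):

#include <iostream>
#include <string>
#include <vector>

struct ToyOpDesc { std::string type; };

int main() {
  std::vector<ToyOpDesc> main_block = {{"feed"}, {"conv2d"}, {"softmax"}, {"fetch"}};
  std::vector<const ToyOpDesc*> feeds, fetchs;
  for (const auto& op : main_block) {
    if (op.type == "feed") feeds.push_back(&op);
    else if (op.type == "fetch") fetchs.push_back(&op);
  }
  std::cout << feeds.size() << " input(s), " << fetchs.size() << " output(s)\n";  // 1 input(s), 1 output(s)
}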
- for (auto& op : program.ops()) { - auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); - std::string op_type, alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - auto kernels = op->CreateKernels({place}); - // filter out a kernel - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - -#ifdef LITE_WITH_OPENCL - if ((*it)->target() == TARGET(kOpenCL)) { - std::unique_ptr ctx(new KernelContext()); - (*local_ctx).As().CopySharedTo(&ctx->As()); - (*it)->SetContext(std::move(ctx)); - } else { - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +void LightPredictor::BuildRuntimeProgram( + const std::shared_ptr& program_desc) { + auto* exe_scope = &scope_->NewScope(); + // Prepare workspace + scope_->Var("feed")->GetMutable>(); + scope_->Var("fetch")->GetMutable>(); + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto var_desc = block_desc->GetVar(var_idx); + if (!var_desc->Persistable()) { + exe_scope->Var(var_desc->Name()); + } else { + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue; + scope_->Var(var_desc->Name()); + } } -#else - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); -#endif - - insts.emplace_back(op, std::move(*it)); } - program_.reset(new RuntimeProgram(std::move(insts))); - - CHECK(program.exec_scope()); - program_->set_exec_scope(program.exec_scope()); + // Only extracting the ops and generate the runtime program from the main + // block desc + program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx)); } void LightPredictor::DequantizeWeight() { + std::shared_ptr program_desc = program_desc_; #define PROCESS_CONV2D_DATA() \ for (int64_t i = 0; i < ch; ++i) { \ for (int64_t j = 0; j < offset; ++j) { \ @@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() { } return result; }; - Tensor tmp_tensor; - for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { - auto* block = cpp_program_desc_.GetBlock(i); + for (size_t i = 0; i < program_desc->BlocksSize(); i++) { + auto* block = program_desc->GetBlock(i); for (size_t k = 0; k < block->OpsSize(); ++k) { auto* op_desc = block->GetOp(k); if (is_weight_quantized_op(op_desc)) { diff --git a/lite/api/light_api.h b/lite/api/light_api.h index e651d1323a5ce6e36546e9437d06a472eb8a5137..97a46b7d28ffc84feb87283eed9786b562a45229 100644 --- a/lite/api/light_api.h +++ b/lite/api/light_api.h @@ -46,6 +46,7 @@ class LITE_API LightPredictor { LightPredictor(const std::string& lite_model_file, bool model_from_memory = false) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(lite_model_file, model_from_memory); } @@ -57,6 +58,7 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kNaiveBuffer) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory); } @@ -78,6 +80,7 @@ class LITE_API LightPredictor { std::vector GetInputNames(); std::vector GetOutputNames(); void PrepareFeedFetch(); + Scope* scope() { return scope_.get(); } private: void Build(const std::string& lite_model_file, 
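Editor's note: the new BuildRuntimeProgram above no longer instantiates kernels by hand; it only partitions variables (non-persistable activations go into the per-run exec scope, persistable weights stay in the shared root scope, and the special "feed"/"fetch" variables are created once up front) before handing the program desc to RuntimeProgram. A toy sketch of that partitioning with a hypothetical VarInfo type:

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct VarInfo { std::string name; bool persistable; };

int main() {
  std::vector<VarInfo> block = {{"feed", true}, {"conv1_w", true}, {"tmp_0", false}};
  std::set<std::string> root_scope = {"feed", "fetch"};  // workspace prepared up front
  std::set<std::string> exec_scope;
  for (const auto& v : block) {
    if (!v.persistable) exec_scope.insert(v.name);
    else if (v.name != "feed" && v.name != "fetch") root_scope.insert(v.name);
  }
  std::cout << root_scope.size() << " root vars, " << exec_scope.size() << " exec vars\n";  // 3 root vars, 1 exec vars
}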
@@ -91,14 +94,15 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool model_from_memory = false); - void BuildRuntimeProgram(const cpp::ProgramDesc& prog); + void BuildRuntimeProgram( + const std::shared_ptr& program_desc); void DequantizeWeight(); private: std::shared_ptr scope_; std::unique_ptr program_; - cpp::ProgramDesc cpp_program_desc_; + std::shared_ptr program_desc_; std::vector input_names_; std::vector output_names_; }; diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 718ba020fb9c6daa4dc4d7263238692267335a48..c9c34377e2a82b72d26e3148a694fe0662e985ce 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { threads_ = config.threads(); #ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( config.subgraph_model_cache_dir()); #endif } diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 465f82056c6bb80b706cfb7d875773d75735911b..b523d5951b3302c5aa46763625af12e24da0015e 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -97,7 +97,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; + double eps = first_target == TARGET(kOpenCL) ? 
0.25 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index c2fb594e8877020848ecc90c039c31d6f77f638b..e6a53e93e72261082fa220c5fe7b0c12bf60ca87 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -112,6 +112,8 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); } else if (target_repr == "mlu") { @@ -201,6 +203,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -265,16 +268,17 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 4ee18e24a632777c6a3e4a661c90aa9b59654028..ed41a821c0938b599dc8900baa021491df78f329 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { valid_places_.emplace_back(TARGET(kX86)); } else if (target_repr == "npu") { valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places_.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places_.emplace_back(TARGET(kXPU)); } else if (target_repr == "rknpu") { @@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() { " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " "default\n" " `set_lite_out(output_optimize_model_dir)`\n" - " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `record_model_info(false|true)`: refer to whether to record ops " "info for striping lib, false by default`\n" " `run() : start model transformation`\n" @@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + 
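Editor's note: ParserValidPlaces above now accepts "huawei_ascend_npu" in the comma-separated --valid_targets string. A minimal sketch of that string-to-target mapping with a toy enum and parser (the real code attaches precision/layout information to each Place as well):

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

enum class ToyTarget { kARM, kNPU, kHuaweiAscendNPU, kUnk };

std::vector<ToyTarget> ParseValidTargets(const std::string& repr) {
  static const std::map<std::string, ToyTarget> table = {
      {"arm", ToyTarget::kARM},
      {"npu", ToyTarget::kNPU},
      {"huawei_ascend_npu", ToyTarget::kHuaweiAscendNPU}};
  std::vector<ToyTarget> out;
  std::stringstream ss(repr);
  std::string item;
  while (std::getline(ss, item, ',')) {
    auto it = table.find(item);
    out.push_back(it == table.end() ? ToyTarget::kUnk : it->second);
  }
  return out;
}

int main() {
  auto places = ParseValidTargets("arm,huawei_ascend_npu");
  std::cout << places.size() << " targets\n";  // 2 targets
}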
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "paddlelite opt version:" << opt_version << std::endl << help_info << std::endl; @@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index bfeff4879820f132a331e9bff56a5f9c494fe775..08d2233536b90d2b39c7ba6e6733036652179d5f 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "lite/api/paddle_api.h" + +#include + #include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" @@ -21,10 +24,30 @@ #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/target_wrapper.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_runtime.h" +#endif namespace paddle { namespace lite_api { +bool IsOpenCLBackendValid() { + bool opencl_valid = false; +#ifdef LITE_WITH_OPENCL + opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(); +#endif + LOG(INFO) << "opencl_valid:" << opencl_valid; + return opencl_valid; +} + Tensor::Tensor(void *raw) : raw_tensor_(raw) {} // TODO(Superjomn) refine this by using another `const void* const_raw`; @@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) { data, src_data, num * sizeof(T), lite::IoDirection::HtoD); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::HtoD); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; @@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const { data, src_data, num * sizeof(T), lite::IoDirection::DtoH); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::DtoH); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; @@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); +template void Tensor::CopyFromCpu(const float *); +template void Tensor::CopyFromCpu(const int8_t *); + template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int8_t *) const; @@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { mlu_input_layout_ = layout; } -void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { - mlu_use_first_conv_ = 
use_first_conv; -} -void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { +void CxxConfig::set_mlu_firstconv_param(const std::vector &mean, + const std::vector &std) { mlu_first_conv_mean_ = mean; -} -void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { mlu_first_conv_std_ = std; } lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { @@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { } int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } -bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -const std::vector &CxxConfig::mlu_first_conv_mean() const { - return mlu_first_conv_mean_; -} -const std::vector &CxxConfig::mlu_first_conv_std() const { - return mlu_first_conv_std_; +std::pair, std::vector> +CxxConfig::mlu_firstconv_param() const { + return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_); } #endif void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { #ifdef LITE_WITH_XPU - lite::Context::SetWorkspaceL3Size(l3_size); + lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_workspace_l3_size_per_thread' is ignored, please " @@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) { #ifdef LITE_WITH_XPU - lite::Context::SetDev(dev_no); + lite::TargetWrapperXPU::SetDev(dev_no); #else LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " "ignored, please rebuild it with LITE_WITH_XPU=ON."; @@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { #ifdef LITE_WITH_XPU - lite::Context::_multi_encoder_precision = precision; + lite::TargetWrapperXPU::multi_encoder_precision = precision; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_multi_encoder_precision' is " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index d28ea8fdbf3f77a15f9ef561e03555090fddac97..6fe00bbd32d51e7d923901792e9d62166058c406 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -21,6 +21,7 @@ #define PADDLE_LITE_API_H_ #include #include +#include #include #include "paddle_place.h" // NOLINT @@ -32,6 +33,9 @@ using lod_t = std::vector>; enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; +// return true if current device supports OpenCL model +LITE_API bool IsOpenCLBackendValid(); + struct LITE_API Tensor { explicit Tensor(void* raw); explicit Tensor(const void* raw); @@ -122,6 +126,7 @@ class LITE_API ConfigBase { PowerMode mode_{LITE_POWER_NO_BIND}; // to save subgraph model for npu/xpu/... std::string subgraph_model_cache_dir_{""}; + int device_id_{0}; public: explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); @@ -141,6 +146,9 @@ class LITE_API ConfigBase { const std::string& subgraph_model_cache_dir() const { return subgraph_model_cache_dir_; } + // set Device ID + void set_device_id(int device_id) { device_id_ = device_id; } + const int get_device_id() const { return device_id_; } }; /// CxxConfig is the config for the Full feature predictor. 
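Editor's note: the three MLU first-conv setters are merged into a single set_mlu_firstconv_param(mean, std) call above, with a paired getter. A hedged usage sketch (requires a library built with LITE_WITH_MLU=ON; the mean/std values are illustrative ImageNet statistics, not values taken from this patch):

#include <vector>
#include "paddle_api.h"

int main() {
  paddle::lite_api::CxxConfig config;
  // One call now carries both per-channel vectors used by MLU's uint8 first conv.
  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
  std::vector<float> std_dev = {0.229f, 0.224f, 0.225f};
  config.set_mlu_firstconv_param(mean, std_dev);
  // The matching getter returns them together as a std::pair.
  auto p = config.mlu_firstconv_param();
  return p.first.size() == p.second.size() ? 0 : 1;
}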
@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase { lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; - bool mlu_use_first_conv_{false}; - std::vector mlu_first_conv_mean_; - std::vector mlu_first_conv_std_; + std::vector mlu_first_conv_mean_{}; + std::vector mlu_first_conv_std_{}; #endif public: @@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase { void set_mlu_core_version(lite_api::MLUCoreVersion core_version); // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); - // set MLU input layout. User can specify layout of input data to be NHWC, - // default is NCHW - void set_mlu_input_layout(DataLayoutType layout); // whether use MLU's first conv kernel. First conv is a special kernel // provided by MLU, its input is uint8, and also needs two 3-dimentional // vectors which save all inputs' mean and std values - void set_mlu_use_first_conv(bool use_first_conv); - // set the 3-dimentional mean vector used by MLU's first conv - void set_mlu_first_conv_mean(const std::vector& mean); - // set the 3-dimentional std vector used by MLU's first conv - void set_mlu_first_conv_std(const std::vector& std); + // set the 3-dimentional mean vector and 3-dimentional std vector used by + // MLU's first conv + void set_mlu_firstconv_param(const std::vector& mean, + const std::vector& std); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; - bool mlu_use_first_conv() const; - const std::vector& mlu_first_conv_mean() const; - const std::vector& mlu_first_conv_std() const; + // std::pair + std::pair, std::vector> mlu_firstconv_param() const; #endif // XPU only, set the size of the workspace memory from L3 cache for the diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 832867df079efa1baebf08da4c0d8e37958460f1..4edd61277059e20f7dfb1b8410a784fd04d85502 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,8 +15,11 @@ #include "lite/api/paddle_api.h" #include #include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" + DEFINE_string(model_dir, "", ""); namespace paddle { diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 9bc63e78aae92556a312eb36c3415f9d57c2239a..29a119a6916e1e9fe9880c801291072351c18365 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) { "Sigmoid", "Tanh", "Swish", - "Exp"}; + "Exp", + "ThresholdedRelu"}; auto x = static_cast(act); CHECK_LT(x, static_cast(ActivationType::NUM)); return act2string[x]; @@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) { "bm", "mlu", "rknpu", - "apu"}; + "apu", + "huawei_ascend_npu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) { "kBM", "kMLU", "kRKNPU", - "kAPU"}; + "kAPU", + "kHuaweiAscendNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -162,7 +165,8 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kMLU), 
TARGET(kAPU), TARGET(kRKNPU), - TARGET(kFPGA)}); + TARGET(kFPGA), + TARGET(kHuaweiAscendNPU)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 7066656f18ec0693048223f5f1201e77a1b0a37d..5161d6b58af01f7af4dcbaec6a1cacb91e7c7056 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -57,7 +57,8 @@ enum class TargetType : int { kMLU = 11, kRKNPU = 12, kAPU = 13, - NUM = 14, // number of fields. + kHuaweiAscendNPU = 14, + NUM = 15, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -106,7 +107,8 @@ enum class ActivationType : int { kAbs = 9, kHardSwish = 10, kReciprocal = 11, - NUM = 12, + kThresholdedRelu = 12, + NUM = 13, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 6732b968734631cf74c1e8fc7b825f3e0b89b9fe..f132b2064e76a85865b6092240ec96d6af9ae49a 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass); USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(graph_visualize_pass); +USE_MIR_PASS(remove_tf_redundant_ops_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); +USE_MIR_PASS(lite_conv_conv_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); @@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); +USE_MIR_PASS(huawei_ascend_npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass) USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass); +USE_MIR_PASS(__xpu__mmdnn_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index b7b24dfcea31d6e6e78538c6ac33923116b2e5a5..e32b61094a0b9ce9781cb6e9b8aef7ab753d7278 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) { .value("MLU", TargetType::kMLU) .value("RKNPU", TargetType::kRKNPU) .value("APU", TargetType::kAPU) + .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc index d70ecf3c03955286244aa13cfe65f19569a55930..ded851d93313c3e155dd7f8860eee7446e56e715 100644 --- a/lite/api/test_yolov3_lite_bm.cc +++ b/lite/api/test_yolov3_lite_bm.cc @@ -59,9 +59,9 @@ void TestModel(const std::vector& valid_places) { } auto* image_tensor = predictor.GetInput(1); image_tensor->Resize(DDim(std::vector({1, 2}))); - data = image_tensor->mutable_data(); - data[0] = FLAGS_im_height; - data[1] = FLAGS_im_width; + auto* data_1 = image_tensor->mutable_data(); + data_1[0] = FLAGS_im_height; + data_1[1] = FLAGS_im_width; for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); diff --git a/lite/backends/CMakeLists.txt 
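Editor's note: the enum edits above illustrate an invariant worth keeping in mind: whenever a backend such as kHuaweiAscendNPU is appended to TargetType, NUM must be bumped and every name table (TargetToStr, TargetRepr, ExpandValidTargets, the Python binding, ...) must grow by the same entry, otherwise the CHECK_LT(x, NUM)-guarded lookups go out of range. A small sketch of that pattern with a toy enum, not the real paddle_place.h types:

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

enum class ToyTarget : int { kHost = 0, kARM = 1, kHuaweiAscendNPU = 2, NUM = 3 };

const std::string& ToyTargetRepr(ToyTarget t) {
  static const std::vector<std::string> names = {"kHost", "kARM", "kHuaweiAscendNPU"};
  int x = static_cast<int>(t);
  // Keep the table and the enum in lockstep.
  assert(names.size() == static_cast<size_t>(ToyTarget::NUM));
  assert(x < static_cast<int>(ToyTarget::NUM));
  return names[x];
}

int main() { std::cout << ToyTargetRepr(ToyTarget::kHuaweiAscendNPU) << "\n"; }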
b/lite/backends/CMakeLists.txt index 7f0d53f976ace17ee8d95e62e62d56f5cb974881..27a8a46cfa1413ea0d9ffa3641d8e4bd60785e11 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(mlu) add_subdirectory(bm) add_subdirectory(apu) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index aecec295ae0269fb34a3c4fa38e396bdf98d4418..9cf8f6a507401656bb0df214bd463a09fd82a61d 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s2_depthwise_int8.cc conv5x5s2_depthwise_fp32.cc conv3x3_winograd_fp32_c4.cc + conv3x3_winograd_int8.cc conv_winograd_3x3.cc conv_impl.cc softmax.cc @@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) split_merge_lod_tenosr.cc reduce_prod.cc lstm.cc + clip.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 1d01642100109d14a413ad5e274606c88bf0005a..01f25cbd36d327f7a3c252fdc675262d39748318 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -753,23 +753,15 @@ void act_abs(const float* din, float* dout, int size, int threads) { } } -#ifdef LITE_WITH_TRAIN template <> -void act_square_grad(const float* din, - const float* dout_grad, - float* din_grad, - int size, - int threads) { - const float* ptr_out_grad = dout_grad; - float* ptr_in_grad = din_grad; +void act_thresholded_relu( + const float* din, float* dout, int size, float threshold, int threads) { for (int i = 0; i < size; ++i) { - ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; - ptr_out_grad++; - ptr_in_grad++; + dout[0] = (din[0] > threshold ? din[0] : 0.f); din++; + dout++; } } -#endif } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 50f60f300bbab9b9f0bcad222f31699b7bfadeab..b0147040cd11a888ec045948f0914a13aa932a2f 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads); template void act_abs(const T* din, T* dout, int size, int threads); -#ifdef LITE_WITH_TRAIN template -void act_square_grad( - const T* din, const T* dout_grad, T* din_grad, int size, int threads); -#endif +void act_thresholded_relu( + const T* din, T* dout, int size, float threshold, int threads); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc index 32b7d3bfeba6107493d62a0c9be14a3c15ce7692..74dfa143bda97219874b0e53efc7de34b0416c0e 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids, selected_ids->Resize(dims); selected_scores->Resize(dims); if (parent_idx) { - parent_idx->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); } auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); diff --git a/lite/backends/arm/math/clip.cc b/lite/backends/arm/math/clip.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f8b48db53b9fe1b50a0832a64b3849faa417fb8 --- /dev/null +++ b/lite/backends/arm/math/clip.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
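Editor's note: usage sketch for the new act_thresholded_relu routine added to lite/backends/arm/math/activation.{h,cc} above (an ARM build of the library is assumed). The scalar semantics come straight from activation.cc: y = x when x > threshold, otherwise 0; the threads argument is unused by the scalar path shown in this patch.

#include <iostream>
#include "lite/backends/arm/math/activation.h"

int main() {
  const float x[4] = {-1.f, 0.5f, 1.0f, 2.5f};
  float y[4];
  paddle::lite::arm::math::act_thresholded_relu<float>(
      x, y, /*size=*/4, /*threshold=*/1.0f, /*threads=*/1);
  for (float v : y) std::cout << v << " ";  // 0 0 0 2.5 (1.0 is not strictly above the threshold)
  std::cout << "\n";
}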
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/clip.h" +#include +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/saturate.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output) { + float tmp; + for (int64_t i = 0; i < num; i++) { + tmp = *input; + tmp = tmp > min ? tmp : min; + *output = tmp < max ? tmp : max; + input++; + output++; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/clip.h b/lite/backends/arm/math/clip.h new file mode 100644 index 0000000000000000000000000000000000000000..cd74a8880abfb660c13c630ca708fa9c8f849d12 --- /dev/null +++ b/lite/backends/arm/math/clip.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
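Editor's note: usage sketch for the new clip_kernel_fp32 added above (ARM build assumed). Every element is clamped into [min, max]; the function operates on raw pointers, so the caller owns both buffers.

#include <iostream>
#include "lite/backends/arm/math/clip.h"

int main() {
  const float in[5] = {-2.f, -0.5f, 0.f, 0.5f, 2.f};
  float out[5];
  paddle::lite::arm::math::clip_kernel_fp32(in, /*num=*/5, /*min=*/-1.f, /*max=*/1.f, out);
  for (float v : out) std::cout << v << " ";  // -1 -0.5 0 0.5 1
  std::cout << "\n";
}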
+ +#pragma once + +#include +#include +#include +#include "lite/operators/op_params.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index 35d9eeaee1b69bed423cd3b489217c71575b3079..2957085493f15016abf2bf50f0aabecbe95f5b36 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8( for (int i = 0; i < ch_out * ch_in * 64; ++i) { int new_c = i % 64; int new_oc = i / ch_in / 64 / 4; - int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_ic = i / 64 % ch_in; int new_inner = i / ch_in / 64 % 4; int dest_ind = new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; @@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4( for (int i = 0; i < ch_out * ch_in * 16; ++i) { int new_c = i % 16; int new_oc = i / ch_in / 16 / 4; - int new_ic = i / 16 % (ch_in * 4) % ch_in; + int new_ic = i / 16 % ch_in; int new_inner = i / ch_in / 16 % 4; int dest_ind = new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; diff --git a/lite/backends/arm/math/conv3x3_winograd_int8.cc b/lite/backends/arm/math/conv3x3_winograd_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b9870730e0cec07add470ad13292e1598736e5a --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_int8.cc @@ -0,0 +1,602 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
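Editor's note: the new_ic change in weight_trans_c4_8x8/weight_trans_c4_4x4 above is a pure simplification, not a behavioral fix: since ch_in * 4 is a multiple of ch_in, (x % (ch_in * 4)) % ch_in is always equal to x % ch_in. A tiny standalone check of that identity over the same index range the loops use (sizes are arbitrary examples):

#include <cassert>
#include <iostream>

int main() {
  const int ch_in = 3, ch_out = 5;
  for (int i = 0; i < ch_out * ch_in * 64; ++i) {
    int old_ic = i / 64 % (ch_in * 4) % ch_in;
    int new_ic = i / 64 % ch_in;
    assert(old_ic == new_ic);  // the shorter form decomposes the index identically
  }
  std::cout << "identical for all indices\n";
}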
+ +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c8_4x4_int8(const int8_t* src, + int src_stride, + int src_h_stride, + int16_t* dest, + int dest_stride, + int dest_h_stride); +void output_trans_c8_post_2x4_int8(const int32_t* src, + int src_stride, + int src_h_stride, + int32_t* dest, + int dest_stride, + int dest_h_stride); +void weight_trans_c8_4x4_int8( + int16_t* dest, const int8_t* src, int ic, int oc, void* workspace); + +// F(2,3) +template +void conv_compute_2x2_3x3_int8(const int8_t* input, + Dtype* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; + int8_t* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(int8_t); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_8 = (chin + 7) / 8; + int oc_8 = (chout + 7) / 8; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; + + const int zero_len = (w_pad + 3) / 4 * 4; + Dtype zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(Dtype)); + + int8_t* input_c8 = tmp_work_space; + int new_h_stride = w_pad * 8; + int new_c_stride = new_h_stride * h_pad; + + int ic_8_stride = w_pad * h_pad * 8; + int oc_8_stride = wout * hout * 8; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + int16_t* g_tmp_data = + (int16_t*)(tmp_work_space + ic_8 * ic_8_stride + // NOLINT + oc_8 * oc_8_stride * sizeof(int32_t)); + int tmp_input_thread_stride = tile_block * ic_8 * 128; + int tmp_output_thread_stride = tile_block * oc_8 * 128; + int tmp_data_thread_stride_size = tmp_input_thread_stride * sizeof(int16_t) + + tmp_output_thread_stride * sizeof(int32_t); + memset(g_tmp_data, 0, tmp_data_thread_stride_size); + int8_t* g_trans_remain_tmp_data = + (int8_t*)(g_tmp_data + // NOLINT + threads * (tmp_input_thread_stride + + tmp_output_thread_stride * sizeof(int32_t) / + sizeof(int16_t))); + int32_t* g_trans_tmp_data = + (int32_t*)(g_trans_remain_tmp_data + threads * 128); // NOLINT + auto act_type = act_param.active_type; + int flag_act = 0; // relu: 1, relu6: 2, leakey: 3 + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 1; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 2; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 3; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } + // begin compute + for (int ni = 0; ni < num; 
++ni) { + // trans input to c8 + for (int i = 0; i < ic_8; ++i) { + prepack_input_nxwc8_int8_dw(input + ni * in_n_stride, + input_c8 + i * new_c_stride, + i * 8, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, + chin, + win, + hin); + } + int32_t* output_c8 = (int32_t*)(input_c8 + ic_8 * ic_8_stride); // NOLINT + Dtype* output_ptr = output + ni * out_n_stride; + + const int16_t* weight_ptr = weight; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + int16_t* tmp_data = + g_tmp_data + + omp_get_thread_num() * tmp_data_thread_stride_size / sizeof(int16_t); + int32_t* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 32; + int8_t* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 128; +#else + int16_t* tmp_data = g_tmp_data; + int32_t* trans_tmp_data = g_trans_tmp_data; + int8_t* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_8 * 8; + int b_gi_stride = tile_count * ic_8 * 8; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + int16_t* dst_ptr = tmp_data + ti * 8; + const int8_t* src_ptr = input_c8 + (src_y * w_pad + src_x) * 8; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_8; ++ci) { + const int8_t* src_ci = src_ptr + ci * ic_8_stride; + int16_t* dst_ci = dst_ptr + ci * tile_count * 8; + input_trans_c8_4x4_int8( + src_ci, 8, w_pad * 8, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_8; ++ci) { + const int8_t* src_ci = src_ptr + ci * ic_8_stride; + // pad + memset(trans_remain_tmp_data, 0, 128 * sizeof(int8_t)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + int8_t* dst_yi = trans_remain_tmp_data + yi * 32; + const int8_t* src_yi = src_ci + w_pad * yi * 8; + memcpy(dst_yi, src_yi, x_size * sizeof(int8_t) * 8); + } + } + + // trans + int16_t* dst_ci = dst_ptr + ci * tile_count * 8; + input_trans_c8_4x4_int8(trans_remain_tmp_data, + 8, + 32, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + int32_t* dst_temp_data = + (int32_t*)(tmp_data + tmp_input_thread_stride); // NOLINT + int16_t* b_ptr = tmp_data; + int w_gi_stride = ic_8 * oc_8 * 64; + for (int gi = 0; gi < 16; ++gi) { + int32_t* origin_C = dst_temp_data + gi * c_gi_stride; + int16_t* origin_B = b_ptr + gi * b_gi_stride; + const int16_t* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c8_int16_small( + oc_8 * 8, tile_count, ic_8 * 8, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + int32_t* src_ptr = dst_temp_data + ti * 8; + int32_t* trans_remain_tmp_i32_data = + (int32_t*)(trans_remain_tmp_data); // NOLINT + int32_t* dst_ptr = output_c8 + (dst_y * wout + dst_x) * 8; + + if (ex == 2 && ey == 2) { + // trans output + for (int ci = 0; ci < oc_8; ++ci) { + int cur_ind = ci * 8; + + int32_t* src_ci = src_ptr + ci * tile_count * 8; + int32_t* dst_ci = dst_ptr + ci * oc_8_stride; + output_trans_c8_post_2x4_int8( + src_ci, c_gi_stride, c_gi_stride * 4, dst_ci, 8, wout * 8); + } + } else { + for (int ci = 0; ci < oc_8; ++ci) { + int cur_ind = ci * 8; + // trans output + int32_t* src_ci = src_ptr + ci * tile_count * 8; + output_trans_c8_post_2x4_int8(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_i32_data, + 8, + 16); + // copy to dest + int32_t* dst_ci = dst_ptr + ci * oc_8_stride; + for (int i = 0; i < ey; ++i) { + memcpy(dst_ci + i * wout * 8, + trans_remain_tmp_i32_data + i * 16, + ex * sizeof(int32_t) * 8); + } + } + } + } + //*/ + } // for block_count + const float* bias_local_ptr = bias; + for (int ci = 0; ci < oc_8; ++ci) { + float bias_local[8] = {bias_local_ptr[0], + bias_local_ptr[1], + bias_local_ptr[2], + bias_local_ptr[3], + bias_local_ptr[4], + bias_local_ptr[5], + bias_local_ptr[6], + bias_local_ptr[7]}; + write_int32_nchwc8_to_nchw(output_c8 + ci * oc_8_stride, + output_ptr, + ci * 8, + ci * 8 + 8, + 0, + hout, + 0, + wout, + chout, + hout, + wout, + flag_act, + alpha, + bias_local, + param.bias, + zero_ptr, + scale + ci * 8); + bias_local_ptr += 8; + } + } // for num +} // conv compute +template void conv_compute_2x2_3x3_int8( + const int8_t* input, + int8_t* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx); +template void conv_compute_2x2_3x3_int8( + const int8_t* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx); + +// BT=[1, 0, -1, 0, +// 0, 1, 1, 0, +// 0, -1, 1, 0, +// 0, 1, 0, -1] +void input_trans_c8_4x4_int8(const int8_t* src, + int src_stride, + int src_h_stride, + int16_t* dest, + int dest_stride, + int dest_h_stride) { + int8x8_t src00 = vld1_s8(src); + int8x8_t src01 = vld1_s8(src + src_stride); + int8x8_t src02 = vld1_s8(src + src_stride + src_stride); + int8x8_t src03 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src10 = vld1_s8(src); + int8x8_t src11 = vld1_s8(src + src_stride); + int8x8_t src12 = vld1_s8(src + src_stride + src_stride); + int8x8_t src13 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src20 = vld1_s8(src); + int8x8_t src21 = vld1_s8(src + src_stride); + int8x8_t src22 = vld1_s8(src + src_stride + src_stride); + int8x8_t src23 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src30 = vld1_s8(src); + int8x8_t src31 = vld1_s8(src + src_stride); + int8x8_t src32 = vld1_s8(src + src_stride + src_stride); + int8x8_t src33 = vld1_s8(src + src_stride + src_stride + src_stride); + + int16x8_t dst00 = vsubl_s8(src00, src02); + int16x8_t dst10 = vaddl_s8(src01, src02); + int16x8_t dst20 = vsubl_s8(src02, src01); + int16x8_t dst30 = vsubl_s8(src01, src03); + + int16x8_t dst01 = vsubl_s8(src10, src12); + int16x8_t dst11 = vaddl_s8(src11, src12); 
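Editor's note: a scalar reference sketch of the F(2,3) input transform whose NEON version (input_trans_c8_4x4_int8) appears above. It computes V = B^T * d * B with the B^T rows given in the source comment ([1,0,-1,0], [0,1,1,0], [0,-1,1,0], [0,1,0,-1]), written for a single channel so the column and row passes are easy to follow; the NEON code applies the same pattern to 8 channels at once.

#include <cstdint>
#include <iostream>

void winograd_f23_input_transform(const int8_t d[4][4], int16_t v[4][4]) {
  int16_t t[4][4];  // B^T * d (first pass over columns)
  for (int c = 0; c < 4; ++c) {
    t[0][c] = d[0][c] - d[2][c];
    t[1][c] = d[1][c] + d[2][c];
    t[2][c] = d[2][c] - d[1][c];
    t[3][c] = d[1][c] - d[3][c];
  }
  for (int r = 0; r < 4; ++r) {  // (B^T * d) * B (same pattern over rows)
    v[r][0] = t[r][0] - t[r][2];
    v[r][1] = t[r][1] + t[r][2];
    v[r][2] = t[r][2] - t[r][1];
    v[r][3] = t[r][1] - t[r][3];
  }
}

int main() {
  int8_t d[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}};
  int16_t v[4][4];
  winograd_f23_input_transform(d, v);
  std::cout << v[0][0] << "\n";  // 0 for this ramp-shaped tile
}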
+ int16x8_t dst21 = vsubl_s8(src12, src11); + int16x8_t dst31 = vsubl_s8(src11, src13); + + int16x8_t dst02 = vsubl_s8(src20, src22); + int16x8_t dst12 = vaddl_s8(src21, src22); + int16x8_t dst22 = vsubl_s8(src22, src21); + int16x8_t dst32 = vsubl_s8(src21, src23); + + int16x8_t dst03 = vsubl_s8(src30, src32); + int16x8_t dst13 = vaddl_s8(src31, src32); + int16x8_t dst23 = vsubl_s8(src32, src31); + int16x8_t dst33 = vsubl_s8(src31, src33); + + int16x8_t dest00 = vsubq_s16(dst00, dst02); + int16x8_t dest10 = vaddq_s16(dst01, dst02); + int16x8_t dest20 = vsubq_s16(dst02, dst01); + int16x8_t dest30 = vsubq_s16(dst01, dst03); + + int16x8_t dest01 = vsubq_s16(dst10, dst12); + int16x8_t dest11 = vaddq_s16(dst11, dst12); + int16x8_t dest21 = vsubq_s16(dst12, dst11); + int16x8_t dest31 = vsubq_s16(dst11, dst13); + + int16x8_t dest02 = vsubq_s16(dst20, dst22); + int16x8_t dest12 = vaddq_s16(dst21, dst22); + int16x8_t dest22 = vsubq_s16(dst22, dst21); + int16x8_t dest32 = vsubq_s16(dst21, dst23); + + int16x8_t dest03 = vsubq_s16(dst30, dst32); + int16x8_t dest13 = vaddq_s16(dst31, dst32); + int16x8_t dest23 = vsubq_s16(dst32, dst31); + int16x8_t dest33 = vsubq_s16(dst31, dst33); + + vst1q_s16(dest, dest00); + vst1q_s16(dest + dest_stride, dest10); + vst1q_s16(dest + dest_stride + dest_stride, dest20); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest30); + dest += dest_h_stride; + vst1q_s16(dest, dest01); + vst1q_s16(dest + dest_stride, dest11); + vst1q_s16(dest + dest_stride + dest_stride, dest21); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest31); + dest += dest_h_stride; + vst1q_s16(dest, dest02); + vst1q_s16(dest + dest_stride, dest12); + vst1q_s16(dest + dest_stride + dest_stride, dest22); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest32); + dest += dest_h_stride; + vst1q_s16(dest, dest03); + vst1q_s16(dest + dest_stride, dest13); + vst1q_s16(dest + dest_stride + dest_stride, dest23); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest33); +} + +// AT=[1, 1, 1, 0, +// 0, 1, -1, -1] +void output_trans_c8_post_2x4_int8(const int32_t* src, + int src_stride, + int src_h_stride, + int32_t* dest, + int dest_stride, + int dest_h_stride) { + int32x4_t src400 = vld1q_s32(src); + int32x4_t src800 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src401 = vld1q_s32(src); + int32x4_t src801 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src402 = vld1q_s32(src); + int32x4_t src802 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src403 = vld1q_s32(src); + int32x4_t src803 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src410 = vld1q_s32(src); + int32x4_t src810 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src411 = vld1q_s32(src); + int32x4_t src811 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src412 = vld1q_s32(src); + int32x4_t src812 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src413 = vld1q_s32(src); + int32x4_t src813 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src420 = vld1q_s32(src); + int32x4_t src820 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src421 = vld1q_s32(src); + int32x4_t src821 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src422 = vld1q_s32(src); + int32x4_t src822 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src423 = vld1q_s32(src); + int32x4_t src823 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src430 = vld1q_s32(src); + int32x4_t src830 = 
vld1q_s32(src + 4); + src += src_stride; + int32x4_t src431 = vld1q_s32(src); + int32x4_t src831 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src432 = vld1q_s32(src); + int32x4_t src832 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src433 = vld1q_s32(src); + int32x4_t src833 = vld1q_s32(src + 4); + + int32x4_t dst400 = vaddq_s32(vaddq_s32(src400, src401), src402); + int32x4_t dst410 = vsubq_s32(vsubq_s32(src401, src402), src403); + int32x4_t dst401 = vaddq_s32(vaddq_s32(src410, src411), src412); + int32x4_t dst411 = vsubq_s32(vsubq_s32(src411, src412), src413); + int32x4_t dst402 = vaddq_s32(vaddq_s32(src420, src421), src422); + int32x4_t dst412 = vsubq_s32(vsubq_s32(src421, src422), src423); + int32x4_t dst403 = vaddq_s32(vaddq_s32(src430, src431), src432); + int32x4_t dst413 = vsubq_s32(vsubq_s32(src431, src432), src433); + + int32x4_t dst800 = vaddq_s32(vaddq_s32(src800, src801), src802); + int32x4_t dst810 = vsubq_s32(vsubq_s32(src801, src802), src803); + int32x4_t dst801 = vaddq_s32(vaddq_s32(src810, src811), src812); + int32x4_t dst811 = vsubq_s32(vsubq_s32(src811, src812), src813); + int32x4_t dst802 = vaddq_s32(vaddq_s32(src820, src821), src822); + int32x4_t dst812 = vsubq_s32(vsubq_s32(src821, src822), src823); + int32x4_t dst803 = vaddq_s32(vaddq_s32(src830, src831), src832); + int32x4_t dst813 = vsubq_s32(vsubq_s32(src831, src832), src833); + + int32x4_t dest400 = vaddq_s32(vaddq_s32(dst400, dst401), dst402); + int32x4_t dest410 = vsubq_s32(vsubq_s32(dst401, dst402), dst403); + int32x4_t dest401 = vaddq_s32(vaddq_s32(dst410, dst411), dst412); + int32x4_t dest411 = vsubq_s32(vsubq_s32(dst411, dst412), dst413); + + int32x4_t dest800 = vaddq_s32(vaddq_s32(dst800, dst801), dst802); + int32x4_t dest810 = vsubq_s32(vsubq_s32(dst801, dst802), dst803); + int32x4_t dest801 = vaddq_s32(vaddq_s32(dst810, dst811), dst812); + int32x4_t dest811 = vsubq_s32(vsubq_s32(dst811, dst812), dst813); + + vst1q_s32(dest, dest400); + vst1q_s32(dest + 4, dest800); + dest += dest_stride; + vst1q_s32(dest, dest410); + vst1q_s32(dest + 4, dest810); + dest += dest_h_stride - dest_stride; + vst1q_s32(dest, dest401); + vst1q_s32(dest + 4, dest801); + dest += dest_stride; + vst1q_s32(dest, dest411); + vst1q_s32(dest + 4, dest811); +} + +void weight_trans_c8_4x4_int8( + int16_t* dest, const int8_t* din, int ch_in, int ch_out, void* workspace) { + const int16_t coeff[4][3] = {{2, 0, 0}, {1, 1, 1}, {1, -1, 1}, {0, 0, 2}}; + + int16_t* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const int8_t* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + int16_t* ptr_channel = ptr_out + (i * ch_in + j) * 16; + + //! transform kernel, transposed + const int8_t* k0 = kernel0; + const int8_t* k1 = kernel0 + 3; + const int8_t* k2 = kernel0 + 6; + + //! h + int16_t tmp[4][3]; + for (int i = 0; i < 4; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 4; j++) { + int16_t* tmpp = &tmp[j][0]; + for (int i = 0; i < 4; i++) { + ptr_channel[j * 4 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 7) / 8 * 8; + int ic_pad = (ch_in + 7) / 8 * 8; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 16; ++i) { + int new_c = i % 16; + int new_oc = i / ch_in / 16 / 8; + int new_ic = i / 16 % ch_in; + int new_inner = i / ch_in / 16 % 8; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 8 + new_ic * 8 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 78d4f3f74e3e8a0fb06b1fda83ad5deed281621b..c72223d2e845bc67b541e6f1790e45129deff62f 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din, } return true; } +// for example: m = 4, n = 4 +// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9 , 10 ,11], [12, 13, 14, 15]] +// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]] +/* + m = 8 n = 8: 0 1 2 3 4 5 6 7 0 8 16 24 32 40 48 56 + 16 17 18 19 20 21 22 23 2 10 18 26 34 42 50 58 + 24 25 26 27 28 29 30 31 3 11 19 27 35 43 51 59 + 32 33 34 35 36 37 38 39 4 12 20 28 36 44 52 60 ... + } + } +*/ +template +void local_transpose(const Dtype* din, Dtype* dout, int m, int n) { + // n % 4 == 0 && m % 4 == 0 + // n * m ==> n * m data trans + int offset_m = m << 2; + const Dtype* din_ptr = din; + Dtype* dout_ptr = dout; + for (int i = 0; i < n; i += 4) { + Dtype* out_ptr0 = dout_ptr; + Dtype* out_ptr1 = dout_ptr + m; + Dtype* out_ptr2 = out_ptr1 + m; + Dtype* out_ptr3 = out_ptr2 + m; + const Dtype* in_ptr0 = din_ptr; + const Dtype* in_ptr1 = din_ptr + m; + const Dtype* in_ptr2 = in_ptr1 + m; + const Dtype* in_ptr3 = in_ptr2 + m; + for (int j = 0; j < m; j += 4) { + float32x4_t vin0 = vld1q_f32(in_ptr0); + float32x4_t vin1 = vld1q_f32(in_ptr1); + float32x4_t vin2 = vld1q_f32(in_ptr2); + float32x4_t vin3 = vld1q_f32(in_ptr3); + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3); + in_ptr0 = in_ptr3 + m; + in_ptr1 = in_ptr3 + 2 * m; + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + in_ptr2 = in_ptr1 + m; + in_ptr3 = in_ptr1 + 2 * m; + vst1q_f32(out_ptr0, tmp0.val[0]); + vst1q_f32(out_ptr1, tmp0.val[1]); + out_ptr0 += 4; + out_ptr1 += 4; + vst1q_f32(out_ptr2, tmp2.val[0]); + vst1q_f32(out_ptr3, tmp2.val[1]); + out_ptr2 += 4; + out_ptr3 += 4; + } + dout_ptr += offset_m; + din_ptr += 4; + } +} +template +void transpose(const Dtype* din, Dtype* dout, int m, int n) { + // nxm == mxn + // 4x4 + int cnt_n = n >> 2; + int remain_n = n & 3; + int cnt_m = m >> 2; + int remain_m = m & 3; + int nn_num = n << 2; // n * 4 + int mm_num = m << 2; // m * 4 + for (int x = 0; x < cnt_n; x++) { + const Dtype* din_ptr0 = din + x * mm_num; + const Dtype* din_ptr1 = din_ptr0 + m; + const Dtype* 
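Editor's note: a scalar reference for the NEON transpose helpers (local_transpose / transpose) added to conv_block_utils.h above. The input is read as rows-many rows of cols elements and written out transposed, matching the worked example in the source comment ([[0,1,2,3],[4,5,6,7],...] -> [[0,4,8,12],[1,5,9,13],...]); parameter names here are deliberately explicit rather than reusing the template's m/n arguments.

#include <iostream>

void transpose_ref(const float* din, float* dout, int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      dout[j * rows + i] = din[i * cols + j];
}

int main() {
  float din[16], dout[16];
  for (int i = 0; i < 16; ++i) din[i] = static_cast<float>(i);
  transpose_ref(din, dout, 4, 4);
  std::cout << dout[0] << " " << dout[1] << " " << dout[2] << " " << dout[3] << "\n";  // 0 4 8 12
}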
din_ptr2 = din_ptr1 + m; + const Dtype* din_ptr3 = din_ptr2 + m; + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03 + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(din0, din1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(din2, din3); + din_ptr0 += 4; + din_ptr1 += 4; + // a00 b00 c00 d00 a02 b02 c02 d02 + // a01 b01 c01 d01 a03 b03 c03 d03 + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + din_ptr2 += 4; + din_ptr3 += 4; + vst1q_f32(dout_ptr0, tmp0.val[0]); + vst1q_f32(dout_ptr1, tmp0.val[1]); + dout_ptr0 += nn_num; + vst1q_f32(dout_ptr2, tmp2.val[0]); + vst1q_f32(dout_ptr3, tmp2.val[1]); + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + *dout_ptr0++ = *din_ptr1++; + *dout_ptr0++ = *din_ptr2++; + *dout_ptr0++ = *din_ptr3++; + } + } + const Dtype* din_ptr0 = din + cnt_n * mm_num; + dout = dout + cnt_n * 4; + for (int x = 0; x < remain_n; x++) { + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + din_ptr0 += 4; + *dout_ptr0 = din0[0]; + *dout_ptr1 = din0[1]; + dout_ptr0 += nn_num; + *dout_ptr2 = din0[2]; + *dout_ptr3 = din0[3]; + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + } + } +} /*preprocessing inputs * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] * n = he - hs @@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din, int w_stride = we - ws; int valid_w = (we > width ? 
width : we) - ws; int cnt = valid_w / 4;
+ int remain = valid_w & 3;
float32x4_t w_scale0 = vld1q_f32(scale); float32x4_t w_scale1 = vld1q_f32(scale + 4);
@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din, flag_act, alpha); }
- if (we > width) {
+ if (remain > 0) {
int offset = 32 * cnt; din_hei_ptr = ptr_din + offset;
- for (int j = ws + cnt * 4; j < width; ++j) {
+ for (int j = 0; j < remain; ++j) {
if (flag_bias) { *(doutc0_ptr++) = cvt_kernel( din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]);
diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h
index 28a2fb7e2a42a27e9ecd3d42b25f9942b481004e..495a13eec17a0c35e90fbf3ef47c505028721857 100644
--- a/lite/backends/arm/math/conv_impl.h
+++ b/lite/backends/arm/math/conv_impl.h
@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input, const float* bias, const operators::ConvParam& param, ARMContext* ctx);
+void input_trans_c8_4x4_int8(const int8_t* src,
+ int src_stride,
+ int src_h_stride,
+ int16_t* dest,
+ int dest_stride,
+ int dest_h_stride);
+void output_trans_c8_post_2x4_int8(const int32_t* src,
+ int src_stride,
+ int src_h_stride,
+ int32_t* dest,
+ int dest_stride,
+ int dest_h_stride);
+void weight_trans_c8_4x4_int8(
+ int16_t* dest, const int8_t* src, int ic, int oc, void* workspace);
+template <typename Dtype>
+void conv_compute_2x2_3x3_int8(const int8_t* input,
+ Dtype* output,
+ int num,
+ int chout,
+ int hout,
+ int wout,
+ int chin,
+ int hin,
+ int win,
+ const int16_t* weight,
+ const float* bias,
+ const float* scale,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
template <typename Dtype> void im2col(const Dtype* data_im,
diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc
index 4d08c1e957d43b5b748ffdb90fd14a07a61d0183..a73a63ddcb67f8790f73aff3fff8368f4005b7e1 100644
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
- #include "lite/backends/arm/math/elementwise.h" +#include #include "lite/backends/arm/math/funcs.h"
@@ -747,6 +747,16 @@ void elementwise_mul(const int* dinx, } }
+template <>
+void elementwise_mul(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ dout[i] = dinx[i] * diny[i];
+ }
+}
+
template <> void elementwise_mul_relu(const float* dinx, const float* diny,
@@ -801,6 +811,17 @@ void elementwise_mul_relu(const float* dinx, } }
+template <>
+void elementwise_mul_relu(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ int64_t tmp = dinx[i] * diny[i];
+ dout[i] = tmp > 0 ? tmp : 0;
+ }
+}
+
template <> void elementwise_mul_broadcast(const float* dinx, const float* diny,
@@ -935,6 +956,29 @@ void elementwise_mul_broadcast(const int* dinx, } }
+template <>
+void elementwise_mul_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* dinx_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int k = 0; k < num; ++k) {
+ *dout_ptr = *dinx_ptr * diny_data;
+ dout_ptr++;
+ dinx_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_mul_relu_broadcast(const float* dinx, const float* diny,
@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast(const float* dinx, } }
+template <>
+void elementwise_mul_relu_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* dinx_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int k = 0; k < num; ++k) {
+ int64_t tmp = *dinx_ptr * diny_data;
+ *dout_ptr = tmp > 0 ? tmp : 0;
+ dout_ptr++;
+ dinx_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_max(const float* dinx, const float* diny,
@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast(const float* dinx, } }
+template <>
+void elementwise_div(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ *dout = *dinx / *diny;
+ dout++;
+ dinx++;
+ diny++;
+ }
+}
+
template <> void elementwise_div(const float* dinx, const float* diny,
@@ -1306,6 +1387,28 @@ void elementwise_div(const float* dinx, } }
+template <>
+void elementwise_div_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* din_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int p = 0; p < num; p++) {
+ *dout_ptr = *din_ptr / diny_data;
+ dout_ptr++;
+ din_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_div_broadcast(const float* dinx, const float* diny,
@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast(const float* dinx, } }
+template <typename T>
+void elementwise_mod_broadcast(
+ const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const T* din_ptr = dinx + offset;
+ const T diny_data = diny[j];
+ T* dout_ptr = dout + offset;
+
+ int cnt = num >> 2;
+ int remain = num % 4;
+ for (int k = 0; k < cnt; ++k) {
+ register T dinx0 = din_ptr[0];
+ register T dinx1 = din_ptr[1];
+ register T dinx2 = din_ptr[2];
+ register T dinx3 = din_ptr[3];
+ dout_ptr[0] = dinx0 % diny_data;
+ dout_ptr[1] = dinx1 % diny_data;
+ dout_ptr[2] = dinx2 % diny_data;
+ dout_ptr[3] = dinx3 % diny_data;
+ din_ptr += 4;
+ dout_ptr += 4;
+ }
+ if (remain > 0) {
+ for (int p = 0; p < remain; p++) {
+ *dout_ptr++ = *din_ptr++ % diny_data;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
+ int cnt = num >> 2;
+ int remain = num % 4;
+#pragma omp parallel for
+ for (int i = 0; i < cnt; i++) {
+ const T* dinx_ptr = dinx + (i << 2);
+ const T* diny_ptr = diny + (i << 2);
+ T* dout_ptr = dout + (i << 2);
+
+ register T dinx0 = dinx_ptr[0];
+ register T dinx1 = dinx_ptr[1];
+ register T dinx2 = dinx_ptr[2];
+ register T dinx3 = dinx_ptr[3];
+
+ register T diny0 = diny_ptr[0];
+ register T diny1 = diny_ptr[1];
+ register T diny2 = diny_ptr[2];
+ register T diny3 = diny_ptr[3];
+
+ dout_ptr[0] = dinx0 % diny0;
+ dout_ptr[1] = dinx1 % diny1;
+ dout_ptr[2] = dinx2 % diny2;
+ dout_ptr[3] = dinx3 % diny3;
+ }
+ if (remain > 0) {
+ const T* dinx_ptr = dinx + (cnt << 2);
+ const T* diny_ptr = diny + (cnt << 2);
+ T* dout_ptr = dout + (cnt << 2);
+ for (int i = 0; i < remain; i++) {
+ *dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
+ }
+ }
+}
+
+template void elementwise_mod(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num);
+
+template void elementwise_mod_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num);
+
} // namespace math } // namespace arm } // namespace lite
diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h
index 06ecab08edcaf06614de94b99084be2ee80647aa..0b400fcce26c7d307777cc6e25d8d25e0d6234bc 100644
--- a/lite/backends/arm/math/elementwise.h
+++ b/lite/backends/arm/math/elementwise.h
@@ -253,6 +253,13 @@ template <typename T> void elementwise_div_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+template <typename T>
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
+
+template <typename T>
+void elementwise_mod_broadcast(
+ const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+
} // namespace math } // namespace arm } // namespace lite
diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h
index e975160c97b6e7396ab208805a4d685586ac00c8..75dcc971b80e53c3874ffcbb108afdc0e0faa705 100644
--- a/lite/backends/arm/math/funcs.h
+++ b/lite/backends/arm/math/funcs.h
@@ -25,6 +25,7 @@ #include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/box_coder.h"
+#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/conv_block_utils.h"
diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc
index 343e93439d2db563e5ccd4d8c6aed681601871a0..f0c7c65c9067dabb46ad43b3a20a1b85d86d62d0 100644
--- a/lite/backends/arm/math/gemm_prepacked_int8.cc
+++ b/lite/backends/arm/math/gemm_prepacked_int8.cc
@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed, Dtype* tmp1 = nullptr; Dtype* tmp2 = nullptr; Dtype* tmp3 = nullptr;
- float32_t scale_local[4];
+ float32_t scale_local[4] = {0, 0, 0, 0};
float32_t bias_local[4] = {0, 0, 0, 0}; if (is_bias) {
- bias_local[0] = bias[y];
- bias_local[1] = bias[y + 1];
- bias_local[2] = bias[y + 2];
- bias_local[3] = bias[y + 3];
+ if (y + 4 <= M) {
+ bias_local[0] = bias[y];
+ bias_local[1] = bias[y + 1];
+ bias_local[2] = bias[y + 2];
+ bias_local[3] = bias[y + 3];
+ } else {
+ switch (M - y) {
+ case 3:
+ bias_local[2] = bias[y + 2];
+ case 2:
+ bias_local[1] = bias[y + 1];
+ case 1:
+ bias_local[0] = bias[y + 0];
+ default:
+ break;
+ }
+ }
} if (scale) {
- scale_local[0] = scale[y];
- scale_local[1] = scale[y + 1];
-
scale_local[2] = scale[y + 2]; - scale_local[3] = scale[y + 3]; + if (y + 4 <= M) { + scale_local[0] = scale[y]; + scale_local[1] = scale[y + 1]; + scale_local[2] = scale[y + 2]; + scale_local[3] = scale[y + 3]; + } else { + switch (M - y) { + case 3: + scale_local[2] = scale[y + 2]; + case 2: + scale_local[1] = scale[y + 1]; + case 1: + scale_local[0] = scale[y + 0]; + default: + break; + } + } } if (y + MBLOCK_INT8_OTH > M) { switch (y + MBLOCK_INT8_OTH - M) { diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc index af4934e85756f03ec197520b2b5c130e27bdcad6..db1189a63c38bdb6ab33c6fa280a6f618b53ef7f 100644 --- a/lite/backends/arm/math/packed_sgemm_c4.cc +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -1679,6 +1679,912 @@ void sgemm_prepack_c4_small(int M, } } +void sgemm_prepack_c8_int16_small(int M, + int N, + int K, + const int16_t* A_packed, + const int16_t* B, + int32_t* C, + ARMContext* ctx) { + const int m_round = (M + 7) / 8 * 8; + const int k_round = (K + 7) / 8 * 8; + const int mloop = m_round >> 3; + const int lda = 8 * k_round; + const int ldb_byte = 8 * N * sizeof(int16_t); + const int kcnt = k_round >> 3; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + for (int m = 0; m < mloop; ++m) { + const int16_t* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smull v20.4s, v0.4h, v4.h[0] \n" + "smull v21.4s, v0.4h, v5.h[0] \n" + "smull v22.4s, v0.4h, v6.h[0] \n" + "smull v23.4s, v0.4h, v7.h[0] \n" + "ld1 {v8.8h, v9.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v10.8h, v11.8h}, [%[b]], #32 \n" //load b2, b3 + + "smull2 v24.4s, v0.8h, v4.h[0] \n" + "smull2 v25.4s, v0.8h, v5.h[0] \n" + "smull2 v26.4s, v0.8h, v6.h[0] \n" + "smull2 v27.4s, v0.8h, v7.h[0] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[1] \n" + "smlal v21.4s, v1.4h, v5.h[1] \n" + "smlal v22.4s, v1.4h, v6.h[1] \n" + "smlal v23.4s, v1.4h, v7.h[1] \n" + + "smlal2 v24.4s, v1.8h, v4.h[1] \n" + "smlal2 v25.4s, v1.8h, v5.h[1] \n" + "smlal2 v26.4s, v1.8h, v6.h[1] \n" + "smlal2 v27.4s, v1.8h, v7.h[1] \n" + + "smull v12.4s, v0.4h, v8.h[0] \n" + "smull v13.4s, v0.4h, v9.h[0] \n" + "smull v14.4s, v0.4h, v10.h[0] \n" + "smull v15.4s, v0.4h, v11.h[0] \n" + + "smull2 v16.4s, v0.8h, v8.h[0] \n" + "smull2 v17.4s, v0.8h, v9.h[0] \n" + "smull2 v18.4s, v0.8h, v10.h[0] \n" + "smull2 v19.4s, v0.8h, v11.h[0] \n" + + "smlal v12.4s, v1.4h, v8.h[1] \n" + "smlal v13.4s, v1.4h, v9.h[1] \n" + "smlal v14.4s, v1.4h, v10.h[1] \n" + "smlal v15.4s, v1.4h, v11.h[1] \n" + + "smlal2 v16.4s, v1.8h, v8.h[1] \n" + "smlal2 v17.4s, v1.8h, v9.h[1] \n" + "smlal2 v18.4s, v1.8h, v10.h[1] \n" + "smlal2 v19.4s, v1.8h, v11.h[1] \n" + + "smlal v20.4s, v2.4h, v4.h[2] \n" + "smlal v21.4s, v2.4h, v5.h[2] \n" + "smlal v22.4s, v2.4h, v6.h[2] \n" + "smlal v23.4s, v2.4h, v7.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[2] \n" + "smlal2 v25.4s, v2.8h, v5.h[2] \n" + "smlal2 v26.4s, v2.8h, v6.h[2] \n" + "smlal2 v27.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v2.4h, v8.h[2] \n" + "smlal v13.4s, v2.4h, v9.h[2] \n" + "smlal v14.4s, v2.4h, v10.h[2] \n" + "smlal v15.4s, v2.4h, v11.h[2] \n" + "smlal2 v16.4s, v2.8h, v8.h[2] \n" + "smlal2 v17.4s, 
v2.8h, v9.h[2] \n" + "smlal2 v18.4s, v2.8h, v10.h[2] \n" + "smlal2 v19.4s, v2.8h, v11.h[2] \n" + + "smlal v20.4s, v3.4h, v4.h[3] \n" + "smlal v21.4s, v3.4h, v5.h[3] \n" + "smlal v22.4s, v3.4h, v6.h[3] \n" + "smlal v23.4s, v3.4h, v7.h[3] \n" + "smlal2 v24.4s, v3.8h, v4.h[3] \n" + "smlal2 v25.4s, v3.8h, v5.h[3] \n" + "smlal2 v26.4s, v3.8h, v6.h[3] \n" + "smlal2 v27.4s, v3.8h, v7.h[3] \n" + "smlal v12.4s, v3.4h, v8.h[3] \n" + "smlal v13.4s, v3.4h, v9.h[3] \n" + "smlal v14.4s, v3.4h, v10.h[3] \n" + "smlal v15.4s, v3.4h, v11.h[3] \n" + "smlal2 v16.4s, v3.8h, v8.h[3] \n" + "smlal2 v17.4s, v3.8h, v9.h[3] \n" + "smlal2 v18.4s, v3.8h, v10.h[3] \n" + "smlal2 v19.4s, v3.8h, v11.h[3] \n" + + "smlal v20.4s, v0.4h, v4.h[4] \n" + "smlal v21.4s, v0.4h, v5.h[4] \n" + "smlal v22.4s, v0.4h, v6.h[4] \n" + "smlal v23.4s, v0.4h, v7.h[4] \n" + + "smlal2 v24.4s, v0.8h, v4.h[4] \n" + "smlal2 v25.4s, v0.8h, v5.h[4] \n" + "smlal2 v26.4s, v0.8h, v6.h[4] \n" + "smlal2 v27.4s, v0.8h, v7.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[5] \n" + "smlal v21.4s, v1.4h, v5.h[5] \n" + "smlal v22.4s, v1.4h, v6.h[5] \n" + "smlal v23.4s, v1.4h, v7.h[5] \n" + + "smlal2 v24.4s, v1.8h, v4.h[5] \n" + "smlal2 v25.4s, v1.8h, v5.h[5] \n" + "smlal2 v26.4s, v1.8h, v6.h[5] \n" + "smlal2 v27.4s, v1.8h, v7.h[5] \n" + + "smlal v12.4s, v0.4h, v8.h[4] \n" + "smlal v13.4s, v0.4h, v9.h[4] \n" + "smlal v14.4s, v0.4h, v10.h[4] \n" + "smlal v15.4s, v0.4h, v11.h[4] \n" + + "smlal2 v16.4s, v0.8h, v8.h[4] \n" + "smlal2 v17.4s, v0.8h, v9.h[4] \n" + "smlal2 v18.4s, v0.8h, v10.h[4] \n" + "smlal2 v19.4s, v0.8h, v11.h[4] \n" + + "smlal v12.4s, v1.4h, v8.h[5] \n" + "smlal v13.4s, v1.4h, v9.h[5] \n" + "smlal v14.4s, v1.4h, v10.h[5] \n" + "smlal v15.4s, v1.4h, v11.h[5] \n" + + "smlal2 v16.4s, v1.8h, v8.h[5] \n" + "smlal2 v17.4s, v1.8h, v9.h[5] \n" + "smlal2 v18.4s, v1.8h, v10.h[5] \n" + "smlal2 v19.4s, v1.8h, v11.h[5] \n" + + "smlal v20.4s, v2.4h, v4.h[6] \n" + "smlal v21.4s, v2.4h, v5.h[6] \n" + "smlal v22.4s, v2.4h, v6.h[6] \n" + "smlal v23.4s, v2.4h, v7.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[6] \n" + "smlal2 v25.4s, v2.8h, v5.h[6] \n" + "smlal2 v26.4s, v2.8h, v6.h[6] \n" + "smlal2 v27.4s, v2.8h, v7.h[6] \n" + "sub %[b], %[b], #128 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v20.4s, v3.4h, v4.h[7] \n" + "smlal v21.4s, v3.4h, v5.h[7] \n" + "smlal v22.4s, v3.4h, v6.h[7] \n" + "smlal v23.4s, v3.4h, v7.h[7] \n" + "smlal2 v24.4s, v3.8h, v4.h[7] \n" + "smlal2 v25.4s, v3.8h, v5.h[7] \n" + "smlal2 v26.4s, v3.8h, v6.h[7] \n" + "smlal2 v27.4s, v3.8h, v7.h[7] \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal v12.4s, v2.4h, v8.h[6] \n" + "smlal v13.4s, v2.4h, v9.h[6] \n" + "smlal v14.4s, v2.4h, v10.h[6] \n" + "smlal v15.4s, v2.4h, v11.h[6] \n" + "smlal2 v16.4s, v2.8h, v8.h[6] \n" + "smlal2 v17.4s, v2.8h, v9.h[6] \n" + "smlal2 v18.4s, v2.8h, v10.h[6] \n" + "smlal2 v19.4s, v2.8h, v11.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + + "smlal v12.4s, v3.4h, v8.h[7] \n" + "smlal v13.4s, v3.4h, v9.h[7] \n" + "smlal v14.4s, v3.4h, v10.h[7] \n" + "smlal v15.4s, v3.4h, v11.h[7] \n" + "smlal2 v16.4s, v3.8h, v8.h[7] \n" + "smlal2 v17.4s, v3.8h, v9.h[7] \n" + "smlal2 v18.4s, v3.8h, v10.h[7] \n" + "smlal2 v19.4s, v3.8h, v11.h[7] \n" + + "beq 2f \n" + "1:\n" + "smlal v20.4s, v0.4h, v4.h[0] \n" + "smlal v21.4s, v0.4h, v5.h[0] \n" + "smlal v22.4s, v0.4h, v6.h[0] \n" + "smlal v23.4s, v0.4h, v7.h[0] \n" + "ld1 {v8.8h, v9.8h}, 
[%[b]], #32 \n" //load b0, b1 + "ld1 {v10.8h, v11.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal2 v24.4s, v0.8h, v4.h[0] \n" + "smlal2 v25.4s, v0.8h, v5.h[0] \n" + "smlal2 v26.4s, v0.8h, v6.h[0] \n" + "smlal2 v27.4s, v0.8h, v7.h[0] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[1] \n" + "smlal v21.4s, v1.4h, v5.h[1] \n" + "smlal v22.4s, v1.4h, v6.h[1] \n" + "smlal v23.4s, v1.4h, v7.h[1] \n" + + "smlal2 v24.4s, v1.8h, v4.h[1] \n" + "smlal2 v25.4s, v1.8h, v5.h[1] \n" + "smlal2 v26.4s, v1.8h, v6.h[1] \n" + "smlal2 v27.4s, v1.8h, v7.h[1] \n" + + "smlal v12.4s, v0.4h, v8.h[0] \n" + "smlal v13.4s, v0.4h, v9.h[0] \n" + "smlal v14.4s, v0.4h, v10.h[0] \n" + "smlal v15.4s, v0.4h, v11.h[0] \n" + + "smlal2 v16.4s, v0.8h, v8.h[0] \n" + "smlal2 v17.4s, v0.8h, v9.h[0] \n" + "smlal2 v18.4s, v0.8h, v10.h[0] \n" + "smlal2 v19.4s, v0.8h, v11.h[0] \n" + + "smlal v12.4s, v1.4h, v8.h[1] \n" + "smlal v13.4s, v1.4h, v9.h[1] \n" + "smlal v14.4s, v1.4h, v10.h[1] \n" + "smlal v15.4s, v1.4h, v11.h[1] \n" + + "smlal2 v16.4s, v1.8h, v8.h[1] \n" + "smlal2 v17.4s, v1.8h, v9.h[1] \n" + "smlal2 v18.4s, v1.8h, v10.h[1] \n" + "smlal2 v19.4s, v1.8h, v11.h[1] \n" + + "smlal v20.4s, v2.4h, v4.h[2] \n" + "smlal v21.4s, v2.4h, v5.h[2] \n" + "smlal v22.4s, v2.4h, v6.h[2] \n" + "smlal v23.4s, v2.4h, v7.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[2] \n" + "smlal2 v25.4s, v2.8h, v5.h[2] \n" + "smlal2 v26.4s, v2.8h, v6.h[2] \n" + "smlal2 v27.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v2.4h, v8.h[2] \n" + "smlal v13.4s, v2.4h, v9.h[2] \n" + "smlal v14.4s, v2.4h, v10.h[2] \n" + "smlal v15.4s, v2.4h, v11.h[2] \n" + "smlal2 v16.4s, v2.8h, v8.h[2] \n" + "smlal2 v17.4s, v2.8h, v9.h[2] \n" + "smlal2 v18.4s, v2.8h, v10.h[2] \n" + "smlal2 v19.4s, v2.8h, v11.h[2] \n" + + "smlal v20.4s, v3.4h, v4.h[3] \n" + "smlal v21.4s, v3.4h, v5.h[3] \n" + "smlal v22.4s, v3.4h, v6.h[3] \n" + "smlal v23.4s, v3.4h, v7.h[3] \n" + "smlal2 v24.4s, v3.8h, v4.h[3] \n" + "smlal2 v25.4s, v3.8h, v5.h[3] \n" + "smlal2 v26.4s, v3.8h, v6.h[3] \n" + "smlal2 v27.4s, v3.8h, v7.h[3] \n" + "smlal v12.4s, v3.4h, v8.h[3] \n" + "smlal v13.4s, v3.4h, v9.h[3] \n" + "smlal v14.4s, v3.4h, v10.h[3] \n" + "smlal v15.4s, v3.4h, v11.h[3] \n" + "smlal2 v16.4s, v3.8h, v8.h[3] \n" + "smlal2 v17.4s, v3.8h, v9.h[3] \n" + "smlal2 v18.4s, v3.8h, v10.h[3] \n" + "smlal2 v19.4s, v3.8h, v11.h[3] \n" + + "smlal v20.4s, v0.4h, v4.h[4] \n" + "smlal v21.4s, v0.4h, v5.h[4] \n" + "smlal v22.4s, v0.4h, v6.h[4] \n" + "smlal v23.4s, v0.4h, v7.h[4] \n" + + "smlal2 v24.4s, v0.8h, v4.h[4] \n" + "smlal2 v25.4s, v0.8h, v5.h[4] \n" + "smlal2 v26.4s, v0.8h, v6.h[4] \n" + "smlal2 v27.4s, v0.8h, v7.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[5] \n" + "smlal v21.4s, v1.4h, v5.h[5] \n" + "smlal v22.4s, v1.4h, v6.h[5] \n" + "smlal v23.4s, v1.4h, v7.h[5] \n" + + "smlal2 v24.4s, v1.8h, v4.h[5] \n" + "smlal2 v25.4s, v1.8h, v5.h[5] \n" + "smlal2 v26.4s, v1.8h, v6.h[5] \n" + "smlal2 v27.4s, v1.8h, v7.h[5] \n" + + "smlal v12.4s, v0.4h, v8.h[4] \n" + "smlal v13.4s, v0.4h, v9.h[4] \n" + "smlal v14.4s, v0.4h, v10.h[4] \n" + "smlal v15.4s, v0.4h, v11.h[4] \n" + + "smlal2 v16.4s, v0.8h, v8.h[4] \n" + "smlal2 v17.4s, v0.8h, v9.h[4] \n" + "smlal2 v18.4s, v0.8h, v10.h[4] \n" + "smlal2 v19.4s, v0.8h, v11.h[4] \n" + + "smlal v12.4s, v1.4h, v8.h[5] \n" + "smlal v13.4s, v1.4h, v9.h[5] \n" + "smlal v14.4s, v1.4h, v10.h[5] \n" + "smlal v15.4s, v1.4h, v11.h[5] \n" + + "smlal2 v16.4s, v1.8h, v8.h[5] \n" + 
"smlal2 v17.4s, v1.8h, v9.h[5] \n" + "smlal2 v18.4s, v1.8h, v10.h[5] \n" + "smlal2 v19.4s, v1.8h, v11.h[5] \n" + + "smlal v20.4s, v2.4h, v4.h[6] \n" + "smlal v21.4s, v2.4h, v5.h[6] \n" + "smlal v22.4s, v2.4h, v6.h[6] \n" + "smlal v23.4s, v2.4h, v7.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[6] \n" + "smlal2 v25.4s, v2.8h, v5.h[6] \n" + "smlal2 v26.4s, v2.8h, v6.h[6] \n" + "smlal2 v27.4s, v2.8h, v7.h[6] \n" + "sub %[b], %[b], #128 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v20.4s, v3.4h, v4.h[7] \n" + "smlal v21.4s, v3.4h, v5.h[7] \n" + "smlal v22.4s, v3.4h, v6.h[7] \n" + "smlal v23.4s, v3.4h, v7.h[7] \n" + "smlal2 v24.4s, v3.8h, v4.h[7] \n" + "smlal2 v25.4s, v3.8h, v5.h[7] \n" + "smlal2 v26.4s, v3.8h, v6.h[7] \n" + "smlal2 v27.4s, v3.8h, v7.h[7] \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal v12.4s, v2.4h, v8.h[6] \n" + "smlal v13.4s, v2.4h, v9.h[6] \n" + "smlal v14.4s, v2.4h, v10.h[6] \n" + "smlal v15.4s, v2.4h, v11.h[6] \n" + "smlal2 v16.4s, v2.8h, v8.h[6] \n" + "smlal2 v17.4s, v2.8h, v9.h[6] \n" + "smlal2 v18.4s, v2.8h, v10.h[6] \n" + "smlal2 v19.4s, v2.8h, v11.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + + "smlal v12.4s, v3.4h, v8.h[7] \n" + "smlal v13.4s, v3.4h, v9.h[7] \n" + "smlal v14.4s, v3.4h, v10.h[7] \n" + "smlal v15.4s, v3.4h, v11.h[7] \n" + "smlal2 v16.4s, v3.8h, v8.h[7] \n" + "smlal2 v17.4s, v3.8h, v9.h[7] \n" + "smlal2 v18.4s, v3.8h, v10.h[7] \n" + "smlal2 v19.4s, v3.8h, v11.h[7] \n" + + "bne 1b \n" + "2: \n" + "stp q20, q24, [%[c]], #32 \n" + "stp q21, q25, [%[c]], #32 \n" + "stp q22, q26, [%[c]], #32 \n" + "stp q23, q27, [%[c]], #32 \n" + "stp q12, q16, [%[c]], #32 \n" + "stp q13, q17, [%[c]], #32 \n" + "stp q14, q18, [%[c]], #32 \n" + "stp q15, q19, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "v7", "v8", "v9", + "v10", "v11", "13", "14", "15", "16", "17", "18", "19","v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "cc", "memory" + ); + // clang format on + b += 64; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + + "smull v8.4s, v0.4h, v4.h[0] \n" + "smull v9.4s, v0.4h, v5.h[0] \n" + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" + "smull2 v10.4s, v0.8h, v4.h[0] \n" + "smull2 v11.4s, v0.8h, v5.h[0] \n" + + "smlal v8.4s, v1.4h, v4.h[1] \n" + "smlal v9.4s, v1.4h, v5.h[1] \n" + "smlal2 v10.4s, v1.8h, v4.h[1] \n" + "smlal2 v11.4s, v1.8h, v5.h[1] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smull v12.4s, v0.4h, v6.h[0] \n" + "smull v13.4s, v0.4h, v7.h[0] \n" + "smull2 v14.4s, v0.8h, v6.h[0] \n" + "smull2 v15.4s, v0.8h, v7.h[0] \n" + "smlal v12.4s, v1.4h, v6.h[1] \n" + "smlal v13.4s, v1.4h, v7.h[1] \n" + "smlal2 v14.4s, v1.8h, v6.h[1] \n" + "smlal2 v15.4s, v1.8h, v7.h[1] \n" + + "smlal v8.4s, v2.4h, v4.h[2] \n" + "smlal v9.4s, v2.4h, v5.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[2] \n" + "smlal2 v11.4s, v2.8h, v5.h[2] \n" + "smlal v8.4s, v3.4h, v4.h[3] \n" + "smlal v9.4s, v3.4h, v5.h[3] \n" + "smlal2 v10.4s, v3.8h, v4.h[3] \n" + "smlal2 v11.4s, v3.8h, v5.h[3] \n" + + "smlal v12.4s, v2.4h, v6.h[2] \n" + "smlal v13.4s, v2.4h, v7.h[2] \n" + "smlal2 v14.4s, v2.8h, v6.h[2] \n" + "smlal2 v15.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, 
v3.4h, v6.h[3] \n" + "smlal v13.4s, v3.4h, v7.h[3] \n" + "smlal2 v14.4s, v3.8h, v6.h[3] \n" + "smlal2 v15.4s, v3.8h, v7.h[3] \n" + + "smlal v8.4s, v0.4h, v4.h[4] \n" + "smlal v9.4s, v0.4h, v5.h[4] \n" + "smlal2 v10.4s, v0.8h, v4.h[4] \n" + "smlal2 v11.4s, v0.8h, v5.h[4] \n" + + "smlal v8.4s, v1.4h, v4.h[5] \n" + "smlal v9.4s, v1.4h, v5.h[5] \n" + "smlal2 v10.4s, v1.8h, v4.h[5] \n" + "smlal2 v11.4s, v1.8h, v5.h[5] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smlal v12.4s, v0.4h, v6.h[4] \n" + "smlal v13.4s, v0.4h, v7.h[4] \n" + "smlal2 v14.4s, v0.8h, v6.h[4] \n" + "smlal2 v15.4s, v0.8h, v7.h[4] \n" + "smlal v12.4s, v1.4h, v6.h[5] \n" + "smlal v13.4s, v1.4h, v7.h[5] \n" + "smlal2 v14.4s, v1.8h, v6.h[5] \n" + "smlal2 v15.4s, v1.8h, v7.h[5] \n" + + "smlal v8.4s, v2.4h, v4.h[6] \n" + "smlal v9.4s, v2.4h, v5.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[6] \n" + "smlal2 v11.4s, v2.8h, v5.h[6] \n" + "smlal v8.4s, v3.4h, v4.h[7] \n" + "smlal v9.4s, v3.4h, v5.h[7] \n" + "smlal2 v10.4s, v3.8h, v4.h[7] \n" + "smlal2 v11.4s, v3.8h, v5.h[7] \n" + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + + "smlal v12.4s, v2.4h, v6.h[6] \n" + "smlal v13.4s, v2.4h, v7.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + "smlal2 v14.4s, v2.8h, v6.h[6] \n" + "smlal2 v15.4s, v2.8h, v7.h[6] \n" + "smlal v12.4s, v3.4h, v6.h[7] \n" + "smlal v13.4s, v3.4h, v7.h[7] \n" + "smlal2 v14.4s, v3.8h, v6.h[7] \n" + "smlal2 v15.4s, v3.8h, v7.h[7] \n" + + "beq 2f \n" + "1: \n" + "smlal v8.4s, v0.4h, v4.h[0] \n" + "smlal v9.4s, v0.4h, v5.h[0] \n" + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" + "smlal2 v10.4s, v0.8h, v4.h[0] \n" + "smlal2 v11.4s, v0.8h, v5.h[0] \n" + + "smlal v8.4s, v1.4h, v4.h[1] \n" + "smlal v9.4s, v1.4h, v5.h[1] \n" + "smlal2 v10.4s, v1.8h, v4.h[1] \n" + "smlal2 v11.4s, v1.8h, v5.h[1] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smlal v12.4s, v0.4h, v6.h[0] \n" + "smlal v13.4s, v0.4h, v7.h[0] \n" + "smlal2 v14.4s, v0.8h, v6.h[0] \n" + "smlal2 v15.4s, v0.8h, v7.h[0] \n" + "smlal v12.4s, v1.4h, v6.h[1] \n" + "smlal v13.4s, v1.4h, v7.h[1] \n" + "smlal2 v14.4s, v1.8h, v6.h[1] \n" + "smlal2 v15.4s, v1.8h, v7.h[1] \n" + + "smlal v8.4s, v2.4h, v4.h[2] \n" + "smlal v9.4s, v2.4h, v5.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[2] \n" + "smlal2 v11.4s, v2.8h, v5.h[2] \n" + "smlal v8.4s, v3.4h, v4.h[3] \n" + "smlal v9.4s, v3.4h, v5.h[3] \n" + "smlal2 v10.4s, v3.8h, v4.h[3] \n" + "smlal2 v11.4s, v3.8h, v5.h[3] \n" + + "smlal v12.4s, v2.4h, v6.h[2] \n" + "smlal v13.4s, v2.4h, v7.h[2] \n" + "smlal2 v14.4s, v2.8h, v6.h[2] \n" + "smlal2 v15.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v3.4h, v6.h[3] \n" + "smlal v13.4s, v3.4h, v7.h[3] \n" + "smlal2 v14.4s, v3.8h, v6.h[3] \n" + "smlal2 v15.4s, v3.8h, v7.h[3] \n" + + "smlal v8.4s, v0.4h, v4.h[4] \n" + "smlal v9.4s, v0.4h, v5.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v0.8h, v4.h[4] \n" + "smlal2 v11.4s, v0.8h, v5.h[4] \n" + + "smlal v8.4s, v1.4h, v4.h[5] \n" + "smlal v9.4s, v1.4h, v5.h[5] \n" + "smlal2 v10.4s, v1.8h, v4.h[5] \n" + "smlal2 v11.4s, v1.8h, v5.h[5] \n" + + "smlal v12.4s, v0.4h, v6.h[4] \n" + "smlal v13.4s, v0.4h, v7.h[4] \n" + "smlal2 v14.4s, v0.8h, v6.h[4] \n" + "smlal2 v15.4s, v0.8h, v7.h[4] \n" + "smlal v12.4s, v1.4h, v6.h[5] \n" + "smlal v13.4s, v1.4h, v7.h[5] \n" + "smlal2 v14.4s, v1.8h, v6.h[5] \n" + "smlal2 v15.4s, v1.8h, v7.h[5] \n" + + "smlal v8.4s, v2.4h, v4.h[6] \n" + "smlal v9.4s, v2.4h, v5.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + 
"smlal2 v10.4s, v2.8h, v4.h[6] \n" + "smlal2 v11.4s, v2.8h, v5.h[6] \n" + "smlal v8.4s, v3.4h, v4.h[7] \n" + "smlal v9.4s, v3.4h, v5.h[7] \n" + "smlal2 v10.4s, v3.8h, v4.h[7] \n" + "smlal2 v11.4s, v3.8h, v5.h[7] \n" + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + + "smlal v12.4s, v2.4h, v6.h[6] \n" + "smlal v13.4s, v2.4h, v7.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + "smlal2 v14.4s, v2.8h, v6.h[6] \n" + "smlal2 v15.4s, v2.8h, v7.h[6] \n" + "smlal v12.4s, v3.4h, v6.h[7] \n" + "smlal v13.4s, v3.4h, v7.h[7] \n" + "smlal2 v14.4s, v3.8h, v6.h[7] \n" + "smlal2 v15.4s, v3.8h, v7.h[7] \n" + + "bne 1b \n" + "2: \n" + "stp q8, q10, [%[c]], #32 \n" + "stp q9, q11, [%[c]], #32 \n" + "stp q12, q14, [%[c]], #32 \n" + "stp q13, q15, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "v7", "v8", "v9", + "v10", "v11","v12", "v13", "v14", "v15", "cc", "memory" + ); + // clang-format on + b += 32; + } + for (; n > 0; --n) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "ld1 {v4.8h}, [%[b]], #16 \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smull v5.4s, v0.4h, v4.h[0] \n" + "smull2 v6.4s, v0.8h, v4.h[0] \n" + "ld1 {v10.8h, v11.8h}, [%[a]], #32 \n" + "smlal v5.4s, v1.4h, v4.h[1] \n" + "smlal2 v6.4s, v1.8h, v4.h[1] \n" + "ld1 {v12.8h, v13.8h}, [%[a]], #32 \n" + "smlal v5.4s, v2.4h, v4.h[2] \n" + "smlal2 v6.4s, v2.8h, v4.h[2] \n" + "smlal v5.4s, v3.4h, v4.h[3] \n" + "smlal2 v6.4s, v3.8h, v4.h[3] \n" + "sub %[b], %[b], #16 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v5.4s, v10.4h, v4.h[4] \n" + "smlal2 v6.4s, v10.8h, v4.h[4] \n" + "smlal v5.4s, v11.4h, v4.h[5] \n" + "smlal2 v6.4s, v11.8h, v4.h[5] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal v5.4s, v12.4h, v4.h[6] \n" + "smlal2 v6.4s, v12.8h, v4.h[6] \n" + "smlal v5.4s, v13.4h, v4.h[7] \n" + "smlal2 v6.4s, v13.8h, v4.h[7] \n" + + "beq 2f \n" + "1: \n" + "ld1 {v4.8h}, [%[b]], #16 \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smlal v5.4s, v0.4h, v4.h[0] \n" + "smlal2 v6.4s, v0.8h, v4.h[0] \n" + "ld1 {v10.8h, v11.8h}, [%[a]], #32 \n" + "smlal v5.4s, v1.4h, v4.h[1] \n" + "smlal2 v6.4s, v1.8h, v4.h[1] \n" + "ld1 {v12.8h, v13.8h}, [%[a]], #32 \n" + "smlal v5.4s, v2.4h, v4.h[2] \n" + "smlal2 v6.4s, v2.8h, v4.h[2] \n" + "smlal v5.4s, v3.4h, v4.h[3] \n" + "smlal2 v6.4s, v3.8h, v4.h[3] \n" + "sub %[b], %[b], #16 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v5.4s, v10.4h, v4.h[4] \n" + "smlal2 v6.4s, v10.8h, v4.h[4] \n" + "smlal v5.4s, v11.4h, v4.h[5] \n" + "smlal2 v6.4s, v11.8h, v4.h[5] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal v5.4s, v12.4h, v4.h[6] \n" + "smlal2 v6.4s, v12.8h, v4.h[6] \n" + "smlal v5.4s, v13.4h, v4.h[7] \n" + "smlal2 v6.4s, v13.8h, v4.h[7] \n" + "bne 1b \n" + + "2: \n" + "st1 {v5.4s, v6.4s}, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "cc", "memory" + ); + // clang-format on + b += 8; + } +#else + for (; n > 3; n -= 4) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile ( + "vld1.16 {d0-d3}, [%[b]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vld1.16 {d4-d7}, [%[b]]! 
\n" + "vmull.s16 q8, d8, d0[0] \n" + "vmull.s16 q9, d8, d2[0] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmull.s16 q10, d9, d0[0] \n" + "vmull.s16 q11, d9, d2[0] \n" + "vmlal.s16 q8, d10, d0[1] \n" + "vmlal.s16 q9, d10, d2[1] \n" + "vmlal.s16 q10, d11, d0[1] \n" + "vmlal.s16 q11, d11, d2[1] \n" + "vmull.s16 q12, d8, d4[0] \n" + "vmull.s16 q13, d8, d6[0] \n" + "vmull.s16 q14, d9, d4[0] \n" + "vmull.s16 q15, d9, d6[0] \n" + "vmlal.s16 q12, d10, d4[1] \n" + "vmlal.s16 q13, d10, d6[1] \n" + "vmlal.s16 q14, d11, d4[1] \n" + "vmlal.s16 q15, d11, d6[1] \n" + + "vmlal.s16 q8, d12, d0[2] \n" + "vmlal.s16 q9, d12, d2[2] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q10, d13, d0[2] \n" + "vmlal.s16 q11, d13, d2[2] \n" + "vmlal.s16 q8, d14, d0[3] \n" + "vmlal.s16 q9, d14, d2[3] \n" + "vmlal.s16 q10, d15, d0[3] \n" + "vmlal.s16 q11, d15, d2[3] \n" + + "vmlal.s16 q12, d12, d4[2] \n" + "vmlal.s16 q13, d12, d6[2] \n" + "vmlal.s16 q14, d13, d4[2] \n" + "vmlal.s16 q15, d13, d6[2] \n" + "vmlal.s16 q12, d14, d4[3] \n" + "vmlal.s16 q13, d14, d6[3] \n" + "vmlal.s16 q14, d15, d4[3] \n" + "vmlal.s16 q15, d15, d6[3] \n" + + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[0] \n" + "vmlal.s16 q9, d8, d3[0] \n" + "vmlal.s16 q10, d9, d1[0] \n" + "vmlal.s16 q11, d9, d3[0] \n" + "vmlal.s16 q8, d10, d1[1] \n" + "vmlal.s16 q9, d10, d3[1] \n" + "vmlal.s16 q10, d11, d1[1] \n" + "vmlal.s16 q11, d11, d3[1] \n" + "vmlal.s16 q8, d12, d1[2] \n" + "vmlal.s16 q9, d12, d3[2] \n" + "vmlal.s16 q10, d13, d1[2] \n" + "vmlal.s16 q11, d13, d3[2] \n" + "vmlal.s16 q8, d14, d1[3] \n" + "vmlal.s16 q9, d14, d3[3] \n" + "vmlal.s16 q10, d15, d1[3] \n" + "vmlal.s16 q11, d15, d3[3] \n" + "vld1.16 {d0-d3}, [%[b]]! \n" + "vmlal.s16 q12, d8, d5[0] \n" + "vmlal.s16 q13, d8, d7[0] \n" + "vmlal.s16 q14, d9, d5[0] \n" + "vmlal.s16 q15, d9, d7[0] \n" + "vmlal.s16 q12, d10, d5[1] \n" + "vmlal.s16 q13, d10, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmlal.s16 q14, d11, d5[1] \n" + "vmlal.s16 q15, d11, d7[1] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q12, d12, d5[2] \n" + "vmlal.s16 q13, d12, d7[2] \n" + "vmlal.s16 q14, d13, d5[2] \n" + "vmlal.s16 q15, d13, d7[2] \n" + "vmlal.s16 q12, d14, d5[3] \n" + "vmlal.s16 q13, d14, d7[3] \n" + "vmlal.s16 q14, d15, d5[3] \n" + "vmlal.s16 q15, d15, d7[3] \n" + + "beq 2f \n" + "1: \n" + "vld1.16 {d4-d7}, [%[b]]! \n" + "vmlal.s16 q8, d8, d0[0] \n" + "vmlal.s16 q9, d8, d2[0] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q10, d9, d0[0] \n" + "vmlal.s16 q11, d9, d2[0] \n" + "vmlal.s16 q8, d10, d0[1] \n" + "vmlal.s16 q9, d10, d2[1] \n" + "vmlal.s16 q10, d11, d0[1] \n" + "vmlal.s16 q11, d11, d2[1] \n" + "vmlal.s16 q12, d8, d4[0] \n" + "vmlal.s16 q13, d8, d6[0] \n" + "vmlal.s16 q14, d9, d4[0] \n" + "vmlal.s16 q15, d9, d6[0] \n" + "vmlal.s16 q12, d10, d4[1] \n" + "vmlal.s16 q13, d10, d6[1] \n" + "vmlal.s16 q14, d11, d4[1] \n" + "vmlal.s16 q15, d11, d6[1] \n" + + "vmlal.s16 q8, d12, d0[2] \n" + "vmlal.s16 q9, d12, d2[2] \n" + "vld1.16 {d8-d11}, [%[a]]! 
\n" + "vmlal.s16 q10, d13, d0[2] \n" + "vmlal.s16 q11, d13, d2[2] \n" + "vmlal.s16 q8, d14, d0[3] \n" + "vmlal.s16 q9, d14, d2[3] \n" + "vmlal.s16 q10, d15, d0[3] \n" + "vmlal.s16 q11, d15, d2[3] \n" + + "vmlal.s16 q12, d12, d4[2] \n" + "vmlal.s16 q13, d12, d6[2] \n" + "vmlal.s16 q14, d13, d4[2] \n" + "vmlal.s16 q15, d13, d6[2] \n" + "vmlal.s16 q12, d14, d4[3] \n" + "vmlal.s16 q13, d14, d6[3] \n" + "vmlal.s16 q14, d15, d4[3] \n" + "vmlal.s16 q15, d15, d6[3] \n" + + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[0] \n" + "vmlal.s16 q9, d8, d3[0] \n" + "vmlal.s16 q10, d9, d1[0] \n" + "vmlal.s16 q11, d9, d3[0] \n" + "vmlal.s16 q8, d10, d1[1] \n" + "vmlal.s16 q9, d10, d3[1] \n" + "vmlal.s16 q10, d11, d1[1] \n" + "vmlal.s16 q11, d11, d3[1] \n" + "vmlal.s16 q8, d12, d1[2] \n" + "vmlal.s16 q9, d12, d3[2] \n" + "vmlal.s16 q10, d13, d1[2] \n" + "vmlal.s16 q11, d13, d3[2] \n" + "vmlal.s16 q8, d14, d1[3] \n" + "vmlal.s16 q9, d14, d3[3] \n" + "vmlal.s16 q10, d15, d1[3] \n" + "vmlal.s16 q11, d15, d3[3] \n" + "vld1.16 {d0-d3}, [%[b]]! \n" + "vmlal.s16 q12, d8, d5[0] \n" + "vmlal.s16 q13, d8, d7[0] \n" + "vmlal.s16 q14, d9, d5[0] \n" + "vmlal.s16 q15, d9, d7[0] \n" + "vmlal.s16 q12, d10, d5[1] \n" + "vmlal.s16 q13, d10, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmlal.s16 q14, d11, d5[1] \n" + "vmlal.s16 q15, d11, d7[1] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q12, d12, d5[2] \n" + "vmlal.s16 q13, d12, d7[2] \n" + "vmlal.s16 q14, d13, d5[2] \n" + "vmlal.s16 q15, d13, d7[2] \n" + "vmlal.s16 q12, d14, d5[3] \n" + "vmlal.s16 q13, d14, d7[3] \n" + "vmlal.s16 q14, d15, d5[3] \n" + "vmlal.s16 q15, d15, d7[3] \n" + + "bne 1b \n" + "2: \n" + "vst1.32 {d16-d17}, [%[c]]! \n" + "vst1.32 {d20-d21}, [%[c]]! \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! \n" + "vst1.32 {d24-d25}, [%[c]]! \n" + "vst1.32 {d28-d29}, [%[c]]! \n" + "vst1.32 {d26-d27}, [%[c]]! \n" + "vst1.32 {d30-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4","q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory" + ); + // clang format on + b += 32; + } + for (; n > 0; --n) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang format off + asm volatile ( + "vld1.16 {d0-d1}, [%[b]]! \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmull.s16 q8, d4, d0[0] \n" + "vmull.s16 q9, d5, d0[0] \n" + "sub %[b], %[b], #16 \n" + "vmlal.s16 q8, d6, d0[1] \n" + "vmlal.s16 q9, d7, d0[1] \n" + "add %[b], %[b], %[ldb] \n" + "subs %[cnt], %[cnt], #1 \n" + + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d0[2] \n" + "vmlal.s16 q9, d9, d0[2] \n" + "vmlal.s16 q8, d10, d0[3] \n" + "vmlal.s16 q9, d11, d0[3] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + + "vmlal.s16 q8, d4, d1[0] \n" + "vmlal.s16 q9, d5, d1[0] \n" + "vmlal.s16 q8, d6, d1[1] \n" + "vmlal.s16 q9, d7, d1[1] \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[2] \n" + "vmlal.s16 q9, d9, d1[2] \n" + "vmlal.s16 q8, d10, d1[3] \n" + "vmlal.s16 q9, d11, d1[3] \n" + "beq 2f \n" + "1:\n" + "vld1.16 {d0-d1}, [%[b]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q8, d4, d0[0] \n" + "vmlal.s16 q9, d5, d0[0] \n" + "sub %[b], %[b], #16 \n" + "vmlal.s16 q8, d6, d0[1] \n" + "vmlal.s16 q9, d7, d0[1] \n" + "add %[b], %[b], %[ldb] \n" + "subs %[cnt], %[cnt], #1 \n" + + "vld1.16 {d4-d7}, [%[a]]! 
\n" + "vmlal.s16 q8, d8, d0[2] \n" + "vmlal.s16 q9, d9, d0[2] \n" + "vmlal.s16 q8, d10, d0[3] \n" + "vmlal.s16 q9, d11, d0[3] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + + "vmlal.s16 q8, d4, d1[0] \n" + "vmlal.s16 q9, d5, d1[0] \n" + "vmlal.s16 q8, d6, d1[1] \n" + "vmlal.s16 q9, d7, d1[1] \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[2] \n" + "vmlal.s16 q9, d9, d1[2] \n" + "vmlal.s16 q8, d10, d1[3] \n" + "vmlal.s16 q9, d11, d1[3] \n" + "bne 1b \n" + "2: \n" + "vst1.32 {d16-d19}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4","q5", "q6", "q7", "q8", + "q9", "cc", "memory" + ); + // clang-format on + b += 8; + } +#endif + A_packed += lda; + } +} + void sgemm_prepack_c4(int M, int N, int K, diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h index 3229ff3e0774ce8bff02b12d79d7ec50ed873cea..51457d57405396f68bf1991bfa43cc6aa9fbe050 100644 --- a/lite/backends/arm/math/packed_sgemm_c4.h +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M, const float* B, float* C, ARMContext* ctx); +void sgemm_prepack_c8_int16_small(int M, + int N, + int K, + const int16_t* A_packed, + const int16_t* B, + int32_t* C, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 3e6cbff0660be8f2542d059a39115bed52122ff1..8303851ece9dd2f1d053f9f4b888e42f2fdc0aad 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, } else { if (pad_bottom > 1) { coef_h = 1.f / 3; - } else if (pad_bottom = 1) { + } else if (pad_bottom == 1) { coef_h = 0.5f; } else { coef_h = 1.f; diff --git a/lite/backends/arm/math/prior_box.cc b/lite/backends/arm/math/prior_box.cc index 6daab69ebf00da24d67132afba4b9abef0afbd39..4ef7356e67cee4c47ddf3eb16ed5286b4271b41a 100644 --- a/lite/backends/arm/math/prior_box.cc +++ b/lite/backends/arm/math/prior_box.cc @@ -21,7 +21,7 @@ namespace lite { namespace arm { namespace math { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc index b8f9ab0a1a842a59971ad4c165d4c1be3426059a..ded76c1bdae354ca46a254309dcc6b3e216c92f4 100644 --- a/lite/backends/arm/math/sequence_pool.cc +++ b/lite/backends/arm/math/sequence_pool.cc @@ -46,11 +46,60 @@ void seq_pool_sum(const float* din, memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; height = height - 1; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; + int cnt_w = width >> 2; + int remain_w = width & 3; + int cnt_h = height >> 2; + int remain_h = height & 3; + int stride = width << 2; + for (int w = 0; w < cnt_w; w++) { + const float* din_ptr0 = din_ptr + w * 4; + float32x4_t dout_val = vld1q_f32(dout_ptr); + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + dout_val = vaddq_f32(din0, dout_val); + float32x4_t tmp = vaddq_f32(din1, din2); + 
din_ptr0 += stride;
+ din_ptr1 += stride;
+ dout_val = vaddq_f32(din3, dout_val);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ dout_val = vaddq_f32(tmp, dout_val);
}
- din_ptr += width;
+ for (int h = 0; h < remain_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ dout_val = vaddq_f32(din0, dout_val);
+ din_ptr0 += width;
+ }
+ vst1q_f32(dout_ptr, dout_val);
+ dout_ptr += 4;
+ }
+ const float* din_ptr00 = din_ptr + cnt_w * 4;
+ for (int w = 0; w < remain_w; w++) {
+ const float* din_ptr0 = din_ptr00 + w;
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ *dout_ptr += din_ptr0[0];
+ float tmp = din_ptr1[0] + din_ptr2[0];
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ *dout_ptr += din_ptr3[0];
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ *dout_ptr += tmp;
+ }
+ for (int h = 0; h < remain_h; h++) {
+ *dout_ptr += din_ptr0[0];
+ din_ptr0 += width;
+ }
+ dout_ptr++;
} } }
@@ -144,12 +193,62 @@ void seq_pool_max(const float* din, } else { memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width;
- int remain_h = height - 1;
- for (int h = 0; h < remain_h; h++) {
- for (int w = 0; w < width; w++) {
- dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]);
+ height = height - 1;
+ int cnt_w = width >> 2;
+ int remain_w = width & 3;
+ int cnt_h = height >> 2;
+ int remain_h = height & 3;
+ int stride = width << 2;
+ for (int w = 0; w < cnt_w; w++) {
+ const float* din_ptr0 = din_ptr + w * 4;
+ float32x4_t dout_val = vld1q_f32(dout_ptr);
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ float32x4_t din1 = vld1q_f32(din_ptr1);
+ float32x4_t din2 = vld1q_f32(din_ptr2);
+ float32x4_t din3 = vld1q_f32(din_ptr3);
+ dout_val = vmaxq_f32(din0, dout_val);
+ float32x4_t tmp = vmaxq_f32(din1, din2);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ dout_val = vmaxq_f32(din3, dout_val);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ dout_val = vmaxq_f32(tmp, dout_val);
}
- din_ptr += width;
+ for (int h = 0; h < remain_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ dout_val = vmaxq_f32(din0, dout_val);
+ din_ptr0 += width;
+ }
+ vst1q_f32(dout_ptr, dout_val);
+ dout_ptr += 4;
+ }
+ const float* din_ptr00 = din_ptr + cnt_w * 4;
+ for (int w = 0; w < remain_w; w++) {
+ const float* din_ptr0 = din_ptr00 + w;
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
+ float tmp = std::max(din_ptr1[0], din_ptr2[0]);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ *dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ *dout_ptr = std::max(*dout_ptr, tmp);
+ }
+ for (int h = 0; h < remain_h; h++) {
+ *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
+ din_ptr0 += width;
+ }
+ dout_ptr++;
} } }
diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc
index 65d41b049123680f26674cc05d3c02172a260b31..b7f82e9f376e8b62195d884e8de19a142d76b316 100644
--- a/lite/backends/arm/math/softmax.cc
+++ b/lite/backends/arm/math/softmax.cc
@@ -531,7 +531,7 @@ void softmax_inner1_large_axis(const float* din, } float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float max_data =
std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { max_data = std::max(max_data, din_max_ptr[0]); din_max_ptr++; } @@ -557,7 +557,7 @@ void softmax_inner1_large_axis(const float* din, float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); sum_data += dout_sum_ptr[0]; din_sum_ptr++; diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 4c7cedaa97e22f74caebc5288fad8543f61bc88d..012004a65fa7d531ed85837e27b880c8c493ffca 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -41,6 +41,8 @@ << "CUDA: " << cudaGetErrorString(e); \ } +#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError()) + #define CUBLAS_CALL(func) \ { \ auto e = (func); \ @@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; +#endif +#if CUDNN_VERSION_MIN(8, 0, 0) + case CUDNN_STATUS_VERSION_MISMATCH: + return "CUDNN_STATUS_VERSION_MISMATCH"; #endif } return "Unknown cudnn status"; diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 9e33d38feedbe682f3c4d962b4ccb85b74af3a7b..c23d3d0ed0351b59d4a373efb2474e9a73763659 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps}) +nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps}) +nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps}) set ( math_cuda @@ -23,8 +28,13 @@ set ( cuda_transpose cuda_elementwise cudnn_pool + cuda_gru_forward + cuda_sequence2batch cuda_gemm cuda_batched_gemm + cuda_strided_gemm + cuda_sequence_padding + cuda_bias ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/activation.cu b/lite/backends/cuda/math/activation.cu index a45e3eb378eefdbabce0b837891514dc659e0429..4d97042aeb0b728b491fbc2dd12ddcc94b4c1490 100644 --- a/lite/backends/cuda/math/activation.cu +++ b/lite/backends/cuda/math/activation.cu @@ -13,6 +13,7 @@ // limitations under the License. 
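+// Usage sketch for the element-wise sigmoid helper added in this file
+// (assuming device pointers `x` and `y` holding `num` floats and an existing
+// CUDA `stream`):
+//   paddle::lite::cuda::math::sigmoid<float>(num, x, y, stream);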
#include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/math/activation.h" #include "lite/backends/cuda/math/utils.h" @@ -21,6 +22,20 @@ namespace lite { namespace cuda { namespace math { +ActivationType GetActiveType(const std::string& act) { + if (act == "sigmoid") { + return kSigmoid; + } else if (act == "relu") { + return kReLU; + } else if (act == "tanh") { + return kTanh; + } else if (act == "identify") { + return kIdentity; + } else { + LOG(FATAL) << "not supported activation: " << act; + } +} + template __global__ void relu_kernel(const int num, const float alpha, @@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t); template void bias_relu( int, const float*, const float* bias, float*, float, cudaStream_t); +// ------------- sigmoid ------------- + +template +__global__ void sigmoid_kernel(const int num, const T* in, T* out) { + CUDA_KERNEL_LOOP(i, num) { +#if __CUDA_ARCH__ >= 350 + out[i] = static_cast(1.0f) / + (static_cast(1.0f) + expf(-1 * __ldg(in + i))); +#else + out[i] = static_cast(1.0f) / (static_cast(1.0f) + expf(-in[i])); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half* in, half* out) { + CUDA_KERNEL_LOOP(i, num) { + half tmp = __float2half(1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __hdiv( + tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i))))); +#else + out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i])))); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) { + CUDA_KERNEL_LOOP(i, num) { + half2 tmp = __floats2half2_rn(1.0f, 1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __h2div(tmp, + __hadd2(tmp, + h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f), + __ldg(in + i))))); +#else + out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x)))); + out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y)))); +#endif + } +} + +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) { + sigmoid_kernel<<>>( + num, din, dout); + CUDA_POST_KERNEL_CHECK; +} + +template <> +void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) { + if (num % 2 == 0) { + const half2* din2 = reinterpret_cast(din); + half2* dout2 = reinterpret_cast(dout); + sigmoid_kernel< + half2><<>>( + num / 2, din2, dout2); + } else { + sigmoid_kernel<<>>( + num, din, dout); + } + CUDA_POST_KERNEL_CHECK; +} + +template void sigmoid(const int num, + const float* din, + float* dout, + cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git a/lite/backends/cuda/math/activation.h b/lite/backends/cuda/math/activation.h index 887a222ee83878aa19fd6a94a76572e48ab4d954..926ad8d99fc4bd6464ed517505fcf30f035c57f8 100644 --- a/lite/backends/cuda/math/activation.h +++ b/lite/backends/cuda/math/activation.h @@ -17,11 +17,22 @@ #include #include +#include "lite/utils/cp_logging.h" + namespace paddle { namespace lite { namespace cuda { namespace math { +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +ActivationType GetActiveType(const std::string& act); + // fp32 and half template void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); @@ -72,6 +83,9 @@ void bias_int8_nhwc(int num, const void* scale, cudaStream_t stream); +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git 
a/lite/backends/cuda/math/bias.cu b/lite/backends/cuda/math/bias.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e597e51c81cf75ddc2f850ac41924a0176ecb45 --- /dev/null +++ b/lite/backends/cuda/math/bias.cu @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/backends/cuda/math/bias.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void RowwiseAddKernel( + const T* a, const T* b, T* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template <> +__global__ void RowwiseAddKernel( + const half* a, const half* b, half* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = __hadd(a[i], b[w]); + } +} + +template +void RowwiseAdd::operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream) { + RowwiseAddKernel<<>>( + input, bias, output, width, count); + CUDA_POST_KERNEL_CHECK; +} + +template struct RowwiseAdd; +template struct RowwiseAdd; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/bias.h b/lite/backends/cuda/math/bias.h new file mode 100644 index 0000000000000000000000000000000000000000..98f805a013ff80b267301be4d47a9694c5ce642f --- /dev/null +++ b/lite/backends/cuda/math/bias.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
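+// Usage sketch for the RowwiseAdd functor declared below: it adds a
+// length-`width` bias vector to every row of a row-major matrix holding
+// `count` elements in total. Assuming device pointers `dout` and `bias` and an
+// existing CUDA `stream`, a call (here in place, dout = dout + bias) would be:
+//   paddle::lite::cuda::math::RowwiseAdd<float> add_bias;
+//   add_bias(dout, bias, dout, width, count, stream);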
+ +#pragma once +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +struct RowwiseAdd { + void operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 19ace2762af7d2088d5235e20387d8a4d941be30..5db41302c0cb0133e3badad0b5fa167d2c88f9df 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -161,15 +161,17 @@ bool CudnnConv2D::create(const operators::ConvParam& param, search_func); } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); + int requestedAlgoCount = 1; + int returnedAlgoCount; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_, + this->input_desc_, + this->filter_desc_, + this->conv_desc_, + this->output_desc_, + requestedAlgoCount, + &returnedAlgoCount, + &this->algo_perf_)); + this->fwd_algo_ = this->algo_perf_.algo; } CUDNN_CHECK( cudnnGetConvolutionForwardWorkspaceSize(this->handle_, diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h index f73f1db7b1785814b6e97f28c8624b76fa75f89c..a084edefa17a5882f7e6d67407e1f48a818e3407 100644 --- a/lite/backends/cuda/math/cudnn_conv.h +++ b/lite/backends/cuda/math/cudnn_conv.h @@ -81,6 +81,7 @@ class CudnnConv2DBase { cudaStream_t stream_; cudnnHandle_t handle_; cudnnConvolutionFwdAlgo_t fwd_algo_; + cudnnConvolutionFwdAlgoPerf_t algo_perf_; cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t bias_desc_; @@ -98,8 +99,6 @@ class CudnnConv2DBase { const bool use_tensor_core_ = true; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t preference_ = - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; // For int8 Tensor temp_tensor_; diff --git a/lite/backends/cuda/math/gru_forward.cu b/lite/backends/cuda/math/gru_forward.cu new file mode 100644 index 0000000000000000000000000000000000000000..cd04c3871db07a18acab99c960a90124941ade5d --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
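+// The GRU forward pass is split into two kernels. GruForwardResetOutput
+// applies the gate activation to the update/reset gates and writes
+// reset_output = reset_gate * h_prev; GruForwardFinalOutput applies the
+// candidate activation to the cell state and blends it with h_prev, i.e.
+// h = (1 - update_gate) * h_prev + update_gate * candidate, or, in
+// origin_mode, h = update_gate * h_prev + (1 - update_gate) * candidate.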
+ +#include + +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + T prev_out = 0; + T reset_out_val; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == lite::cuda::math::ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } + + reset_out_val = prev_out * reset_gate_value; + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +template <> +__global__ void GruForwardResetOutput( + half* gate_value, + half* reset_output_value, + half* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + half prev_out = 0; + half reset_out_val; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } +#if __CUDA_ARCH__ >= 530 + reset_out_val = __hmul(prev_out, reset_gate_value); +#else + reset_out_val = + __float2half(__half2float(prev_out) * __half2float(reset_gate_value)); +#endif + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +/* + * 
threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T output; + T prev_out = 0; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { + output = update_gate_value * prev_out + state_frame_value - + update_gate_value * state_frame_value; + } else { + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template <> +__global__ void GruForwardFinalOutput( + half* gate_value, + half* prev_output_value, + half* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + half output; + half prev_out = 0; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { +#if __CUDA_ARCH__ >= 530 + output = + __hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value), + __hmul(update_gate_value, state_frame_value)); +#else + output = __float2half( + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(state_frame_value) - + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } else { +#if __CUDA_ARCH__ >= 530 + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)), + __hmul(update_gate_value, state_frame_value)); +#else + output = 
__float2half( + __half2float(prev_out) - + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template __global__ void GruForwardFinalOutput( + float* gate_value, + float* prev_output_value, + float* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +template __global__ void GruForwardResetOutput( + float* gate_value, + float* reset_output_value, + float* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gru_forward.h b/lite/backends/cuda/math/gru_forward.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1648c437e860bec07fbec7bbbd69b659a58407 --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.h @@ -0,0 +1,242 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline __device__ Dtype Sigmoid(const Dtype a) { + const Dtype min = SIGMOID_THRESHOLD_MIN; + const Dtype max = SIGMOID_THRESHOLD_MAX; + Dtype tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + expf(-tmp)); +} + +template <> +inline __device__ half Sigmoid(const half a) { +#if __CUDA_ARCH__ >= 530 + const half tmp = __float2half(1.0f); + return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a)))); +#else + return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f)); +#endif +} + +template +inline __device__ Dtype ReLU(const Dtype a) { + return a > static_cast(0.f) ? a : static_cast(0.f); +} + +template <> +inline __device__ half ReLU(const half a) { + const half tmp = __float2half(0.f); +#if __CUDA_ARCH__ >= 530 + return __hgt(a, tmp) ? a : tmp; +#else + return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f); +#endif +} + +template +inline __device__ Dtype Tanh(const Dtype a) { + Dtype tmp = static_cast(-2.0) * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + return (static_cast(2.0) / (static_cast(1.0) + expf(tmp))) - + static_cast(1.0); +} + +template <> +inline __device__ half Tanh(const half a) { +#if __CUDA_ARCH__ >= 530 + half tmp = __float2half(1.0f); + half numerator = __hmul(__float2half(-2.0f), a); + return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp); +#else + float tmp = -2.0f * __half2float(a); + return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f); +#endif +} + +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +/* + * threads(tile_size, 1) + * grids(frame_blocks, 1) + */ +template +__global__ void FastCollectiveGruGate(T* gate_value, + T* prev_output_value, + T* gate_weight, + T* reset_output, + int frame_size, + ActivationType active_node) { + T xt_0 = 0.0f; + T a0 = 0.0f; + T c0 = 0.0f; + T b0[TiledSize]; + + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tiled_mask = ((1 << TiledSize) - 1); + // tiled matrix multiply using register shift, faster than sm. + if (prev_output_value) { + for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = prev_output_value[threadIdx.x + (k * TiledSize)]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col]; + } + } + + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; +#endif + } + } + } + + __syncthreads(); + + if (col < frame_size * 2) { + xt_0 = gate_value[col]; + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col] = c0; + if (frame_size <= col && col < frame_size * 2) { + T htp_0 = 0.0; + if (prev_output_value) { + htp_0 = prev_output_value[col - frame_size]; + } + reset_output[col - frame_size] = c0 * htp_0; + } else if (col < frame_size) { + gate_value[col] = c0; + } + } +} + +template +__global__ void FastCollectiveGruOut(T* gate_weight, + T* prev_out_value, + T* output_value, + T* gate_value, + T* reset_value, + int frame_size, + ActivationType active_node, + bool origin_mode) { + int col = blockIdx.x * blockDim.x + threadIdx.x; + T a0 = 0.0f; + T b0[TiledSize]; + T c0 = 0.0f; + + int tiled_mask = ((1 << TiledSize) - 1); + if (prev_out_value) { + for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = reset_value[threadIdx.x + k * TiledSize]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col]; + } + } + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; 
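// Summary of the GRU math implemented by GruForwardResetOutput and
// GruForwardFinalOutput above (the fp16 branches obscure the arithmetic):
//   u_t = act_gate(g_u),  r_t = act_gate(g_r),  h~_t = act_node(g_c)
//   reset output            : r_t * h_{t-1}   (feeds the candidate GEMM outside the kernel)
//   origin_mode == true     : h_t = u_t * h_{t-1} + (1 - u_t) * h~_t
//   origin_mode == false    : h_t = (1 - u_t) * h_{t-1} + u_t * h~_t
// The half specializations evaluate the same expressions with __hmul/__hadd/__hsub
// on sm_53 and newer, and fall back to float arithmetic via __half2float/__float2half
// on older architectures.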
+#endif + } + } + } + + __syncthreads(); + + if (col < frame_size) { + T xt_0 = gate_value[col + 2 * frame_size]; + T gta_0 = gate_value[col]; + T htp_0 = 0; + if (prev_out_value) { + htp_0 = prev_out_value[col]; + } + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col + 2 * frame_size] = c0; + if (origin_mode) { + output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0; + } else { + output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0; + } + } +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/scale.cu b/lite/backends/cuda/math/scale.cu index 806a3697a2eb19354a81056f0a7ab6272ed991a1..f9d5209c3e4af11231f4b62531f9eb11ede56557 100644 --- a/lite/backends/cuda/math/scale.cu +++ b/lite/backends/cuda/math/scale.cu @@ -22,10 +22,6 @@ namespace lite { namespace cuda { namespace math { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) - template __global__ void scale_kernel(int count, const T* in_data, @@ -48,7 +44,6 @@ __global__ void scale_kernel(int count, template __global__ void scale_kernel( int count, const T* in_data, T* out_data, const T scale, const T bias) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; } } @@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num, } template -void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) { +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } template @@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } -template void scale(int num, const float*, float*, float, cudaStream_t, float); +template void scale(int num, const float*, float*, float, float, cudaStream_t); template void scale(int num, const float*, float*, float, float); } // namespace math diff --git a/lite/backends/cuda/math/scale.h b/lite/backends/cuda/math/scale.h index 52ed1d38ae79ce11cac50a9abef0f57e6de1352c..b9961b12c3c251ffb7f80589fa8c9ccb12d96e30 100644 --- a/lite/backends/cuda/math/scale.h +++ b/lite/backends/cuda/math/scale.h @@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num, cudaStream_t stream); template -void scale( - int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0); +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream); template void scale(int num, const T* in, T* out, T scale, T bias = 0); diff --git a/lite/backends/cuda/math/sequence2batch.cu b/lite/backends/cuda/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..9a93362b3bb163b889049d07186634987ed63940 --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.cu @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, + T* dst, + const uint64_t* index, + int height, + int width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int row_id = blockDim.y * blockIdx.x + idy; + if (row_id < height) { + int src_idx = is_src_index ? index[row_id] : row_id; + int dst_idx = is_src_index ? row_id : index[row_id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } +} + +template +void CopyMatrixRowsFunctor::operator()( + const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream) { + auto src_dims = src.dims(); + auto dst_dims = dst->dims(); + CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; + int height = dst_dims[0]; + int width = dst_dims[1]; + const auto* src_data = src.data(); + auto* dst_data = dst->template mutable_data(TARGET(kCUDA)); + + index_tensor_.Resize({static_cast(index_lod.size())}); + auto* index_tensor_data = index_tensor_.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(index_tensor_data, + index_lod.data(), + sizeof(uint64_t) * index_lod.size(), + IoDirection::HtoD, + stream); + dim3 threads(128, 8); + dim3 grids((height + threads.y - 1) / threads.y); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_tensor_data, height, width, is_src_index); + CUDA_POST_KERNEL_CHECK; +} + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; + +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence2batch.h b/lite/backends/cuda/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..e5a12ed0b4d54a9af47cfc046906ae96767e63cf --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.h @@ -0,0 +1,167 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, copy the indexed rows of input src to the output + // dst. If is_src_index is false, copy the input src to the indexed of output + // dst. The indexes rows are based on the input index. + void operator()(const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream); + + private: + lite::Tensor index_tensor_; +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + struct SeqInfo { + SeqInfo(size_t start_val, size_t len_val, size_t seq_val) + : start(start_val), length(len_val), seq_idx(seq_val) {} + size_t start; + size_t length; + size_t seq_idx; + }; + + public: + void operator()(const lite::Tensor& lod_tensor, + lite::Tensor* batch_tensor, + bool is_reverse, + const cudaStream_t& stream) const { + auto lods = lod_tensor.lod(); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; + const auto& lod = lods[0]; + + std::vector seq_info; + for (int seq_id = 0; seq_id < static_cast(lod.size()) - 1; ++seq_id) { + size_t length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { + return a.length > b.length; + }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // max_seqlen = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = 0 + // batch_start_positions[1] = len(b0) + // batch_start_positions[2] = len(b0) + len(b1) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + + LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + size_t max_seqlen = seq_info[0].length; + batch_lods[0].resize(max_seqlen + 1); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. 
+ batch_lods[2].resize(seq_info.size()); + + auto* batch_starts = batch_lods[0].data(); + auto* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (size_t n = 0; n < max_seqlen; ++n) { + size_t batch_id = batch_starts[n]; + for (size_t i = 0; i < seq_info.size(); ++i) { + size_t seq_len = seq_info[i].length; + size_t start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? start + seq_len - 1 - n : start + n; + ++batch_id; + } else { + break; + } + } + batch_starts[n + 1] = batch_id; + } + auto* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + + batch_tensor->set_lod(batch_lods); + + lite::cuda::math::CopyMatrixRowsFunctor to_batch; + to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const lite::Tensor& batch_tensor, + lite::Tensor* lod_tensor, + const cudaStream_t& stream) { + auto in_lod = batch_tensor.lod(); + CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at " + "least 2-level sequence infomation."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; + lite::cuda::math::CopyMatrixRowsFunctor to_seq; + to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.cu b/lite/backends/cuda/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4f194b9c2289c51983d62b3835727efea91028d --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.cu @@ -0,0 +1,164 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +enum CopyType { kSeqToPad, kPadToSeq }; + +template +__global__ void SequencePadKernel(T* dst, + const T* src, + const T* pad_value, + bool is_constant_pad, + const size_t* seq_offsets, + const int seq_num, + const int pad_seq_len, + const int step_width) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width; + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? 
seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; + } + } +} + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + pad_data, + seq_data, + pad_value_data, + is_constant_pad, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + CUDA_POST_KERNEL_CHECK; +} + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + seq_data, + pad_data, + nullptr, + false, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + CUDA_POST_KERNEL_CHECK; +} + +template void SequencePadding(float* pad_data, + const float* seq_data, + const float* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequencePadding(half* pad_data, + const half* seq_data, + const half* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(float* seq_data, + const float* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(half* seq_data, + const half* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.h b/lite/backends/cuda/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..cfbac9b5bce2cad75174695ee85c28720a3eaf11 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.cc b/lite/backends/cuda/math/strided_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..91013d977702682a42050407f49356bf7445bcbd --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/strided_gemm.h" + +#include + +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +bool StridedGemm::init(const bool trans_a, + const bool trans_b, + Context* ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool StridedGemm::run(const float alpha, + const float beta, + const int m, + const int n, + const int k, + const float* a_data, + const float* b_data, + float* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_32F, + ldb_, + stride_b, + a_data, + CUDA_R_32F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_32F, + ldc_, + stride_c, + batch_size, + CUDA_R_32F, + algo_)); + return true; +} + +template <> +bool StridedGemm::run(const half alpha, + const half beta, + const int m, + const int n, + const int k, + const half* a_data, + const half* b_data, + half* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_16F, + ldb_, + stride_b, + a_data, + CUDA_R_16F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_16F, + ldc_, + stride_c, + batch_size, + CUDA_R_16F, + algo_)); + return true; +} + +template class StridedGemm; +template class StridedGemm; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.h b/lite/backends/cuda/math/strided_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..4a0fe7143a2569eda36d203d9c905f2a4a0c772c --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.h @@ -0,0 +1,72 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
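Why the StridedGemm::run implementations above pass b_data before a_data and swap the m and n arguments: cuBLAS operates on column-major matrices while Lite tensors are row-major. A row-major C (m x n) reinterpreted as column-major is C^T (n x m), and C^T = B^T * A^T, so calling the batched GEMM with the operands exchanged produces the desired row-major C with no explicit transposes. A minimal sketch of the same trick using the plain fp32 strided-batched call; the function name and fixed alpha/beta are illustrative only:

#include <cublas_v2.h>

void BatchedGemmRowMajorSketch(cublasHandle_t handle, int m, int n, int k,
                               const float* A, long long int stride_a,
                               const float* B, long long int stride_b,
                               float* C, int batch) {
  const float alpha = 1.f;
  const float beta = 0.f;
  // Column-major view: C^T (n x m) = B^T (n x k) * A^T (k x m).
  cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                            /*m=*/n, /*n=*/m, /*k=*/k,
                            &alpha,
                            B, /*ldb=*/n, stride_b,   // B^T: leading dimension n
                            A, /*lda=*/k, stride_a,   // A^T: leading dimension k
                            &beta,
                            C, /*ldc=*/n,
                            static_cast<long long int>(m) * n,  // stride between C batches
                            batch);
}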
+ +#pragma once + +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class StridedGemm { + public: + StridedGemm() : cu_handle_(nullptr) {} + ~StridedGemm() {} + + bool init(const bool trans_a, + const bool trans_b, + Context* ctx); + + bool run(const PtypeIn alpha, + const PtypeIn beta, + const int m, + const int n, + const int k, + const PtypeIn* a_data, + const PtypeIn* b_data, + PtypeOut* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu index c50840fe269657965db8c58b171fce6819009775..d919bd757fbbcfcc5e5f8a3a4c18fbd1ed9ac53f 100644 --- a/lite/backends/cuda/math/transpose.cu +++ b/lite/backends/cuda/math/transpose.cu @@ -174,24 +174,9 @@ void Transpose::transpose(T* dst, TransposeCUDAImpl(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); } -// template -// void Transpose::transpose(T* dst, -// const T* src, -// const std::vector& src_dims, -// const std::vector& axes, -// cudaStream_t* stream) { -// std::vector _src_dims(src_dims.size(), 0); -// std::transform( -// src_dims.begin(), -// src_dims.end(), -// _src_dims.begin(), -// [](int data) -> int64_t { return static_cast(data); }); -// TransposeCUDAImpl(_src_dims, axes, src, dst, &Y_dims_, &strides_, -// stream); -//} - template class Transpose; template class Transpose; +template class Transpose; } // namespace math } // namespace cuda diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 3eeee84c1c46a65782e38b998bcd8142e08cbec1..caa9b3077fe96bf73e50b33688b90b71e0cd5c23 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/core/target_wrapper.h" namespace paddle { @@ -31,6 +32,16 @@ class TargetWrapper { static size_t num_devices(); static size_t maximum_stream() { return 0; } + static int GetComputeCapability() { + int dev_id = GetCurDevice(); + int major, minor; + CUDA_CALL(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, dev_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, dev_id)); + return major * 10 + minor; + } + static size_t GetCurDevice() { int dev_id; cudaGetDevice(&dev_id); diff --git a/lite/backends/host/target_wrapper.cc b/lite/backends/host/target_wrapper.cc index 5f020662a9d74aab6c28f79221d670e5de5ae048..00ce9dd6b349decc2f603692c2a6a0801bd4d7c0 100644 --- a/lite/backends/host/target_wrapper.cc +++ b/lite/backends/host/target_wrapper.cc @@ -19,7 +19,7 @@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* TargetWrapper::Malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; @@ -30,7 +30,6 @@ void* TargetWrapper::Malloc(size_t size) { void* r = 
reinterpret_cast(reinterpret_cast(p + offset) & (~(MALLOC_ALIGN - 1))); static_cast(r)[-1] = p; - memset(r, 0, size); return r; } void TargetWrapper::Free(void* ptr) { diff --git a/lite/backends/huawei_ascend_npu/CMakeLists.txt b/lite/backends/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..65616b4d357d4d29ca9b356abead2e1f6eb725d1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs}) +lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu) diff --git a/lite/backends/huawei_ascend_npu/device.cc b/lite/backends/huawei_ascend_npu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..c8dc3d1de46fe12c3cb41257f864bcb1ff82bd9a --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/huawei_ascend_npu/device.h" +#include +#include +#include "ge/ge_api_types.h" +#include "ge/ge_ir_build.h" +#include "graph/graph.h" +#include "lite/utils/io.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +std::shared_ptr Device::LoadFromMem( + const std::vector& model_buffer, const int device_id) { + if (model_buffer.size() == 0) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!"; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromMem( + reinterpret_cast(model_buffer.data()), + model_buffer.size())) { + return model_client; + } + return nullptr; +} + +std::shared_ptr Device::LoadFromFile( + const std::string& model_path, const int device_id) { + if (!paddle::lite::IsFileExists(model_path)) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromFile(model_path.c_str())) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + return model_client; + } + return nullptr; +} + +std::mutex Device::device_mutex_; + +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + std::lock_guard lock(device_mutex_); + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + + // Build IR model + ge::ModelBufferData om_buffer; + std::map options; + 
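// Note: the per-build options map below only sets ge::ir_option::LOG_LEVEL;
// process-wide ATC options such as ge::ir_option::SOC_VERSION are configured
// once in Device::InitOnce() further down. aclgrphBuildModel then lowers the
// ge::Graph into the serialized offline model (om) held by om_buffer, which is
// copied out into model_buffer for the caller.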
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error")); + + ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer)); + + // Copy from om model buffer + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data.get()), + om_buffer.length); + + return true; +} + +void Device::InitOnce() { + if (runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!"; + return; + } + // ACL runtime init => can only be called once in one process + ACL_CALL(aclInit(NULL)); + + // ATC builder init => can only be called once in one process + std::map global_options; + global_options.insert( + std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310")); + ATC_CALL(ge::aclgrphBuildInitialize(global_options)); + + runtime_inited_ = true; +} + +void Device::DestroyOnce() { + if (!runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!"; + return; + } + // ATC builder finalize => can only be called once in one process + ge::aclgrphBuildFinalize(); + // ACL runtime finalize => can only be called once in one process + ACL_CALL(aclFinalize()); + + runtime_inited_ = false; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/device.h b/lite/backends/huawei_ascend_npu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..de7ca55670ad019b0f035f9e8ab42c29748654f1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.h @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() { InitOnce(); } + + ~Device() { DestroyOnce(); } + + std::shared_ptr LoadFromMem( + const std::vector& model_buffer, const int device_id); + std::shared_ptr LoadFromFile(const std::string& model_path, + const int device_id); + // Build the ACL IR graph to the ACL om model + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); // NOLINT + + private: + void InitOnce(); + void DestroyOnce(); + bool runtime_inited_{false}; + static std::mutex device_mutex_; +}; + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.cc b/lite/backends/huawei_ascend_npu/model_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..02a8014210b24f8ae143ee68341aec0281d5a570 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.cc @@ -0,0 +1,398 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +bool AclModelClient::LoadFromMem(const void* data, uint32_t size) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + + auto ret = aclmdlQuerySizeFromMem( + data, size, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromMemWithMem(data, + size, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + model_desc_ = aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success."; + load_flag_ = true; + return true; +} + +bool AclModelClient::LoadFromFile(const char* model_path) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + auto ret = + aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromFileWithMem(model_path, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!"; + return false; + } + model_desc_ = 
aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + load_flag_ = true; + return true; +} + +bool AclModelClient::GetModelIOTensorDim( + std::vector* input_tensor, + std::vector* output_tensor) { + if (!model_desc_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!"; + return false; + } + size_t input_num = aclmdlGetNumInputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num; + for (size_t i = 0; i < input_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ...."; + aclmdlIODims input_dim; + aclmdlGetInputDims(model_desc_, i, &input_dim); + aclDataType data_type = aclmdlGetInputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetInputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format); + input_tensor->push_back(tensor_desc); + } + + size_t output_num = aclmdlGetNumOutputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num; + for (size_t i = 0; i < output_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ...."; + aclmdlIODims output_dim; + aclmdlGetOutputDims(model_desc_, i, &output_dim); + aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format); + output_tensor->push_back(tensor_desc); + } + return true; +} + +bool AclModelClient::GetTensorFromDataset( + std::vector>* output_tensor) { + size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_); + size_t tensor_output_num = reinterpret_cast(output_tensor->size()); + if (device_output_num != tensor_output_num) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output number not equal, device number is " + << device_output_num << "tensor number is " << tensor_output_num; + return false; + } + for (size_t i = 0; i < device_output_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + uint32_t device_size = aclGetDataBufferSize(buffer_device); + + void* tensor_data = nullptr; + aclError ret = aclrtMallocHost(&tensor_data, device_size); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret; + return false; + } + ret = aclrtMemcpy(tensor_data, + device_size, + device_data, + device_size, + ACL_MEMCPY_DEVICE_TO_HOST); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret; + return false; + } + if (output_tensor->at(i)->SetData(reinterpret_cast(tensor_data), + device_size) != ge::GRAPH_SUCCESS) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed"; + return false; + } + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Get output tensor 
from output dataset succeed."; + return true; +} + +void AclModelClient::CreateInputDataset( + std::vector>* input_tensor) { + input_dataset_ = aclmdlCreateDataset(); + if (input_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!"; + return; + } + + for (size_t i = 0; i < input_tensor->size(); i++) { + auto item = input_tensor->at(i); + size_t buffer_size = item->GetSize(); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is " + << buffer_size; + return; + } + void* buffer_data = reinterpret_cast(item->GetData()); + ret = aclrtMemcpy(buffer_device, + buffer_size, + buffer_data, + buffer_size, + ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is " + << buffer_size; + aclrtFree(buffer_device); + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed."; +} +void AclModelClient::CreateOutputDataset( + std::vector>* output_tensor) { + output_dataset_ = aclmdlCreateDataset(); + if (output_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!"; + return; + } + size_t output_size = aclmdlGetNumOutputs(model_desc_); + CHECK_EQ(output_size, output_tensor->size()); + for (size_t i = 0; i < output_size; i++) { + size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. 
size is " + << buffer_size; + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) != + ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed."; +} + +bool AclModelClient::ModelExecute( + std::vector>* input_tensor, + std::vector>* output_tensor) { + // check model exists + if (model_desc_ == nullptr) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] no model description, model execution failed!"; + return false; + } + // create input/output dataset + CreateInputDataset(input_tensor); + CreateOutputDataset(output_tensor); + + // model execution + ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_)); + + // get output + if (!GetTensorFromDataset(output_tensor)) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:" + << model_id_; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:" + << model_id_; + + return true; +} + +void AclModelClient::DestroyDataset(aclmdlDataset** dataset) { + if (*dataset == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!"; + return; + } + + size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset); + for (size_t i = 0; i < dataset_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + if (device_data == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!"; + } else { + if (aclrtFree(device_data) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!"; + } + } + if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!"; + } + } + if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!"; + } + *dataset = nullptr; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success."; +} + +bool AclModelClient::UnloadModel() { + if (!load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is " + << load_flag_; + return true; + } + + DestroyDataset(&input_dataset_); + DestroyDataset(&output_dataset_); + + aclError ret = aclmdlUnload(model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "unload model failed, model id is " << model_id_; + return false; + } + if (model_desc_ != nullptr) { + (void)aclmdlDestroyDesc(model_desc_); + model_desc_ = nullptr; + } + + if (model_memory_ptr_ != nullptr) { + aclrtFree(model_memory_ptr_); + model_memory_ptr_ = nullptr; + model_memory_size_ = 0; + } + + if (model_weight_ptr_ != nullptr) { + aclrtFree(model_weight_ptr_); + model_weight_ptr_ = nullptr; + model_weight_size_ = 0; + } + load_flag_ = false; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_; + return true; +} + +uint32_t AclModelClient::num_devices() { + uint32_t count = 0; + ACL_CALL(aclrtGetDeviceCount(&count)); + return count; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.h 
b/lite/backends/huawei_ascend_npu/model_client.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf19b26261a4ff0301b493c7edf2de6ce3f7ec1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.h @@ -0,0 +1,179 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/huawei_ascend_npu/utils.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class TensorDesc { + public: + TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) { + if (format == ACL_FORMAT_NHWC) { + dim_order[1] = 3; + dim_order[2] = 1; + dim_order[3] = 2; + } + // create ge::Tensordesc + ge_tensor_desc_ = new ge::TensorDesc( + GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type)); + CHECK(ge_tensor_desc_ != nullptr); + } + ~TensorDesc() { ge_tensor_desc_ = nullptr; } + int64_t GetNumber() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[0]); + } + int64_t GetChannel() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[1]); + } + int64_t GetHeight() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[2]); + } + int64_t GetWidth() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[3]); + } + const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; } + + private: + ge::Shape GetGeShape(aclmdlIODims dims) { + ge::Shape ge_shape({0, 0, 0, 0}); + for (size_t i = 0; i < dims.dimCount; i++) { + if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <" + << dims.dims[i] << ">"; + } + } + return ge_shape; + } + ge::Format GetGeFormat(aclFormat format) { + ge::Format ge_format = ge::FORMAT_NCHW; + switch (format) { + case ACL_FORMAT_NCHW: + ge_format = ge::FORMAT_NCHW; + break; + case ACL_FORMAT_NHWC: + ge_format = ge::FORMAT_NHWC; + break; + case ACL_FORMAT_ND: + ge_format = ge::FORMAT_ND; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format; + break; + } + return ge_format; + } + ge::DataType GetGeDataType(aclDataType data_type) { + ge::DataType ge_datatype = ge::DT_FLOAT; + switch (data_type) { + case ACL_FLOAT: + ge_datatype = ge::DT_FLOAT; + break; + case ACL_FLOAT16: + ge_datatype = ge::DT_FLOAT16; + break; + case ACL_INT8: + ge_datatype = ge::DT_INT8; + break; + case ACL_INT16: + ge_datatype = ge::DT_INT16; + break; + case ACL_INT32: + ge_datatype = ge::DT_INT32; + break; + case ACL_INT64: + ge_datatype = ge::DT_INT64; + break; + case ACL_BOOL: + ge_datatype = ge::DT_BOOL; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!"; + break; + } + return ge_datatype; + } + + private: + ge::TensorDesc* ge_tensor_desc_{nullptr}; + // n c h w order, default to ACL_FORMAT_NCHW + std::vector dim_order{0, 1, 2, 3}; +}; + +class AclModelClient { + 
public: + explicit AclModelClient(int device_id) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: " + << device_id; + device_num_ = num_devices(); + if (device_id < 0 || device_id >= device_num_) { + LOG(FATAL) << "Failed with invalid device id " << device_id; + return; + } + device_id_ = device_id; + ACL_CALL(aclrtSetDevice(device_id_)); + } + + ~AclModelClient() { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: " + << device_id_; + ACL_CALL(aclrtResetDevice(device_id_)); + } + + bool LoadFromMem(const void* data, uint32_t size); + bool LoadFromFile(const char* model_path); + bool GetModelIOTensorDim(std::vector* input_tensor, + std::vector* output_tensor); + bool ModelExecute(std::vector>* input_tensor, + std::vector>* output_tensor); + bool UnloadModel(); + + private: + void CreateInputDataset( + std::vector>* input_tensor); + void CreateOutputDataset( + std::vector>* output_tensor); + bool GetTensorFromDataset( + std::vector>* output_tensor); + void DestroyDataset(aclmdlDataset** dataset); + + private: + uint32_t num_devices(); + + private: + int device_id_{0}; + int device_num_{0}; + aclrtContext context_{nullptr}; + bool load_flag_{false}; + uint32_t model_id_{0}; + size_t model_memory_size_; + size_t model_weight_size_; + void* model_memory_ptr_; + void* model_weight_ptr_; + aclmdlDesc* model_desc_{nullptr}; + aclmdlDataset* input_dataset_{nullptr}; + aclmdlDataset* output_dataset_{nullptr}; +}; + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/utils.h b/lite/backends/huawei_ascend_npu/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e2bff3f87e0831f7b98be60ef3980f10da610f10 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/utils.h @@ -0,0 +1,128 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "acl/acl.h" +#include "ge/ge_api_types.h" +#include "ge/ge_ir_build.h" +#include "graph/ge_error_codes.h" +#include "graph/graph.h" +#include "graph/tensor.h" +#include "graph/types.h" +#include "lite/utils/cp_logging.h" + +/* + * This file contains some Huawei Ascend NPU specific uitls. 
+ */ + +#define ACL_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ACL_ERROR_NONE) \ + << (msg) << " Huawei Ascend NPU ACL Error: " \ + << ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \ + reinterpret_cast(msg)) + +#define ATC_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ge::GRAPH_SUCCESS) \ + << (msg) << " Huawei Ascend NPU ATC Error: " \ + << ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \ + reinterpret_cast(msg)) + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +static const char* AtcErrorInfo(uint32_t error) { + switch (error) { +#define LITE_ATC_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF + LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649 +#undef LITE_ATC_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +static const char* AclErrorInfo(int error) { + switch (error) { +#define LITE_ACL_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003 + LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011 + LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013 + LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026 + LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033 + LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034 + LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035 + LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037 + LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038 + LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000 + LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001 + 
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002 + LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003 + LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005 + LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006 + LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000 + LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000 + LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001 + LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002 + LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003 + LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004 + LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005 +#undef LITE_ACL_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc index 2385f69246a163830e0df855082d728da2743e02..b98854946db7eda4f133d773ae0f5ba9e45a77cc 100644 --- a/lite/backends/mlu/target_wrapper.cc +++ b/lite/backends/mlu/target_wrapper.cc @@ -15,6 +15,7 @@ #include "lite/backends/mlu/target_wrapper.h" #include +#include #include "lite/backends/mlu/mlu_utils.h" @@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { } // namespace mlu +thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270}; +thread_local int TargetWrapperMlu::mlu_core_number_{1}; +thread_local bool TargetWrapperMlu::use_first_conv_{false}; +thread_local std::vector TargetWrapperMlu::mean_vec_; +thread_local std::vector TargetWrapperMlu::std_vec_; +thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)}; + size_t TargetWrapperMlu::num_devices() { uint32_t dev_count = 0; CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; @@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst, LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); } } +void TargetWrapperMlu::SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + mean_vec_ = firstconv_param.first; + std_vec_ = firstconv_param.second; + use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty()); + input_layout_ = input_layout; +} + +cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() { + return mlu_core_version_; +} + +int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; } + +bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; } + +const std::vector& TargetWrapperMlu::MeanVec() { return mean_vec_; } + +const std::vector& TargetWrapperMlu::StdVec() { return std_vec_; } -// void TargetWrapperMlu::MemcpyAsync(void* dst, -// const void* src, -// size_t size, -// IoDirection dir, -// const stream_t& stream) { -// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; -// MemcpySync(dst, src, size, dir); -// } +DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; } } // namespace lite } // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h index 
2d9e10806f78e56f50b04d408dab219c923456fc..2566ae153e2f9539d1ad5739f208bc5f946a7542 100644 --- a/lite/backends/mlu/target_wrapper.h +++ b/lite/backends/mlu/target_wrapper.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/target_wrapper.h" @@ -43,11 +45,25 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); - // static void MemcpyAsync(void* dst, - // const void* src, - // size_t size, - // IoDirection dir, - // const queue_t& queue); + static void SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param); + static cnmlCoreVersion_t MLUCoreVersion(); + static int MLUCoreNumber(); + static bool UseFirstConv(); + static const std::vector& MeanVec(); + static const std::vector& StdVec(); + static DataLayoutType InputLayout(); + + private: + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; }; } // namespace lite diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index f9803aa8810ada33b9eecafe1502515501514e41..2b2d5321ba6dbac7ff002039c3c8a0423cbe0a6e 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -20,96 +20,122 @@ namespace paddle { namespace lite { namespace npu { -bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "wb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - uint32_t write_size = - (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); - CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !"; - - fclose(fp); - return true; -} - -bool ReadFromOMFile(domi::ModelBufferData* om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "rb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - fseek(fp, 0, SEEK_END); - uint32_t model_length = (uint32_t)ftell(fp); - fseek(fp, 0, SEEK_SET); - om_model_buff->data = malloc(model_length); - om_model_buff->length = model_length; - uint32_t read_size = - (uint32_t)fread(om_model_buff->data, 1, model_length, fp); - CHECK_EQ(read_size, model_length) << "read om file failed !"; - - fclose(fp); - return true; -} - -std::shared_ptr Device::Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_full_dir = "" // NOLINT - ) { - VLOG(3) << "[NPU] Build model"; - // Build the HiAI IR graph to the HiAI om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - - if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) { - VLOG(3) << "Will read om model from " << model_cache_full_dir; - ReadFromOMFile(&om_model_buf, model_cache_full_dir); - } else { - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return nullptr; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - 
ir_build.ReleaseModelBuff(om_model_buf); - return nullptr; - } - if (!model_cache_full_dir.empty()) { - VLOG(3) << "Will write om model to " << model_cache_full_dir; - WriteToOMFile(om_model_buf, model_cache_full_dir); - } - } - +std::shared_ptr Device::Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp) { // Create a HiAI model manager client to load the HiAI om model - std::shared_ptr model_client( - new hiai::AiModelMngerClient()); + auto model_client = std::make_shared(); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; - ir_build.ReleaseModelBuff(om_model_buf); + LOG(WARNING) << "[NPU] Init hiai model client failed!"; return nullptr; } + // Check HiAI DDK version + const char* ddk_version = model_client->GetVersion(); + if (ddk_version) { + VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version; + } else { + LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!"; + } + // Check model compatibility auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); - model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); - std::vector> model_descs; - model_descs.push_back(model_desc); + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + if (!*model_comp && + model_client->CheckModelCompatibility(*model_desc, *model_comp) != + hiai::AI_SUCCESS) { + *model_comp = false; + VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to " + << *model_comp; + } else { + *model_comp = true; + VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to " + << *model_comp; + } + // Rebuild and write the data of the compatible model to the model buffer + if (!*model_comp) { + std::shared_ptr model_builder = + std::make_shared(model_client); + hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate( + reinterpret_cast(model_buffer->data()), model_buffer->size()); + if (org_model_buffer) { + std::vector org_model_buffers; + org_model_buffers.push_back(org_model_buffer); + hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate( + framework_type(), org_model_buffers); + // VLOG(3) << "[NPU] new model buffer memeory size is " << + // new_model_buffer->GetMemBufferSize(); + if (new_model_buffer) { + uint32_t new_model_size = 0; + if (model_builder->BuildModel(org_model_buffers, + new_model_buffer, + new_model_size) == hiai::AI_SUCCESS) { + // need to change to new_model_size as GetMemBufferSize is not + // correct. 
+ model_buffer->resize(new_model_size); + memcpy(reinterpret_cast(model_buffer->data()), + new_model_buffer->GetMemBufferData(), + new_model_size); + // Reset the model buffer + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + VLOG(3) << "[NPU] Rebuild the compatible model done."; + } else { + LOG(WARNING) << "[NPU] Rebuild the compatible model failed!"; + } + model_builder->MemBufferDestroy(new_model_buffer); + } else { + LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!"; + } + model_builder->MemBufferDestroy(org_model_buffer); + } else { + LOG(WARNING) << "[NPU] InputMemBufferCreate failed!"; + } + } + // Load the compatible model + std::vector> model_descs{ + model_desc}; if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - ir_build.ReleaseModelBuff(om_model_buf); - VLOG(3) << "[NPU] Build done"; + VLOG(3) << "[NPU] Load model done."; return model_client; } +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + + // Build the HiAI om model, serialize and output it to the om buffer + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_buffer; + if (!ir_build.CreateModelBuff(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return false; + } + if (!ir_build.BuildIRModel(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_buffer); + return false; + } + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data), + om_buffer.length); + ir_build.ReleaseModelBuff(om_buffer); + VLOG(3) << "[NPU] Build model done."; + return true; +} + } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index cf03e097194bf20ab428677b09b840991e8a902c..5862f0b393292d95b6500ae75171fab07a5279a6 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -38,14 +38,18 @@ class Device { int model_type() { return model_type_; } int device_type() { return device_type_; } + // Load the HiAI om model from buffer, rebuild the model if it's incompatible + // with the current device, then create a HiAI model manager client(from HiAI + // Server) to run inference + std::shared_ptr Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp); // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
- std::shared_ptr Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_name // NOLINT - ); // NOLINT + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); private: int freq_level_{3}; diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 67d679fdd596b109b714bf7ba3cd45b2632b9420..002073517bc61af60da213db9af6e56da5f5b501 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } -cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; @@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } -cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 82d15bee5ec460a1fb06430571f007fcef23f66f..c204a8510402b8741c761938c3b2c37ac07fe961 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -62,10 +62,10 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); - cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); bool IsArmMali(); diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0 = 0; + CL_COMPUTE_DTYPE b0 = 0; + CL_COMPUTE_DTYPE c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 1c808da68ddc923e12234bc4b6ac99b35bfffb0b..9209f0e0f8d04fad5e788f3742c7922af8e13f49 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 771765ea6063a08784ae824a757b28450d808f6d..6a3aa6455daf8d20430a434ff6f47dac382f1f74 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 79f3922e89549fc15b7a849efb0e2b6595357102..739f852a7c6b60e4c38cb2523dfb745af65bc8df 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index d856af6a1d4026b1595bc287901e53f64267dc81..f08d53fa4968d041337adfe3252529bca3b5c55e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t 
new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 4ed2e072022dc4b457a86d634bf4bc21ab62bc45..4cce039f27b750950a1475ac266e0f5117c6d259 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 4998dc99279fffad8750ef3b6495597e9fc4ad65..2a2f210601e760651ee850686391af3c040fbe7f 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d82f4b4c96b586b6ecf948827402afd0766dcea4..4eadcd9f8032996abae04660b6878ab5beaff9a7 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 27313aea23ed16ecc7a6763dfbbbe63bca18941a..465b9f8f925a130b4d1b059ab15e93bc29128ec7 100755 --- 
a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 5626fe6be7d451d4ffe22a2008affa7d82298bc3..6fbdc21f934f21dd26c3eb66885f7087e3d340c0 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8533076b79aa2e94e30e38dd34d3f2292fdf88a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void transpose_4d(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4 + 2; + const int out_c3 = out_c * 4 + 3; + + const int in_n = out_n; + const int in_c = out_w * 0.25; + const int in_h0 = out_c0; + const int in_h1 = out_c1; + const int in_h2 = out_c2; + const int in_h3 = out_c3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_h3; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0); + + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1); + if(out_w % 4 == 0) { + output.y = input1.x; + } else if(out_w % 4 == 1) { + output.y = input1.y; + } else if(out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2); + if (out_w % 4 == 0){ + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input; + CL_DTYPE4 output; + 
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos); + + output = input; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index d8232cda4c790646fb5a4aae7d4e00d272d3a640..fe6b8fcd99d3f615aefd25145e97b7a08a537794 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() { } bool CLRuntime::Init() { - if (initialized_) { + if (is_cl_runtime_initialized_) { return true; } bool is_platform_init = InitializePlatform(); bool is_device_init = InitializeDevice(); - is_init_success_ = is_platform_init && is_device_init; - initialized_ = true; - - context_ = CreateContext(); - command_queue_ = CreateCommandQueue(context()); - return initialized_; + LOG(INFO) << "is_platform_init:" << is_platform_init; + LOG(INFO) << "is_device_init:" << is_device_init; + if ((is_platform_init == true) && (is_device_init == true)) { + is_platform_device_init_success_ = true; + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); + is_cl_runtime_initialized_ = true; + } + return is_cl_runtime_initialized_; } cl::Platform& CLRuntime::platform() { @@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() { } cl::Device& CLRuntime::device() { - CHECK(device_ != nullptr) << "device_ is not initialized!"; + if (device_ == nullptr) { + LOG(ERROR) << "device_ is not initialized!"; + } return *device_; } @@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) { } bool CLRuntime::InitializeDevice() { + VLOG(3) << "device_info_.size():" << device_info_.size(); + for (auto i : device_info_) { + VLOG(3) << ">>> " << i.first << " " << i.second; + } + if (device_info_.size() > 0 && device_info_.size() <= 2) { + return false; + } + device_info_["PLACEHOLDER"] = 1; // ===================== BASIC ===================== // CL_DEVICE_TYPE_GPU // CL_DEVICE_NAME @@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() { status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); if (all_devices.empty()) { - LOG(FATAL) << "No OpenCL GPU device found!"; + LOG(ERROR) << "No available OpenCL GPU device found!"; return false; } device_ = std::make_shared(); @@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() { } std::map& CLRuntime::GetDeviceInfo() { - if (0 != device_info_.size()) { - return device_info_; - } InitializeDevice(); return device_info_; } diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 3eeea7d63ae8f81e7eb395bc0da70caaf94c2a79..7e28130e15da0d45e62d984202f76aa1aff9762c 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_utility.h" +#include "lite/backends/opencl/cl_wrapper.h" typedef enum { UNKNOWN = 0, @@ -68,6 +69,28 @@ class CLRuntime { public: static CLRuntime* Global(); + bool OpenCLAvaliableForDevice() { + bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound(); + LOG(INFO) << "opencl_lib_found:" << opencl_lib_found; + if (opencl_lib_found == false) return false; + + bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess(); + LOG(INFO) << "dlsym_success:" << dlsym_success; + if (opencl_lib_found == false) return false; + + InitializeDevice(); + bool support_fp16 = + static_cast(device_info_["CL_DEVICE_EXTENSIONS_FP16"]); + LOG(INFO) << "support_fp16:" << support_fp16; + if (support_fp16 == false) return false; + + is_device_avaliable_for_opencl_ = + dlsym_success && opencl_lib_found && support_fp16; + LOG(INFO) << "is_device_avaliable_for_opencl_:" + << is_device_avaliable_for_opencl_; + return is_device_avaliable_for_opencl_; + } + bool Init(); cl::Platform& platform(); @@ -85,7 +108,7 @@ class CLRuntime { bool BuildProgram(cl::Program* program, const std::string& options = ""); - bool IsInitSuccess() { return is_init_success_; } + bool IsInitSuccess() { return is_platform_device_init_success_; } std::string cl_path() { return cl_path_; } @@ -167,9 +190,11 @@ class CLRuntime { cl_int status_{CL_SUCCESS}; - bool initialized_{false}; + bool is_device_avaliable_for_opencl_{false}; + + bool is_cl_runtime_initialized_{false}; - bool is_init_success_{false}; + bool is_platform_device_init_success_{false}; }; } // namespace lite diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 41011b593120d896cd1e6a2537ca59c4cf2a0835..5580a487eaaaf77676d2d6bd41542596504774a4 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -19,14 +19,16 @@ limitations under the License. */ namespace paddle { namespace lite { + CLWrapper *CLWrapper::Global() { static CLWrapper wrapper; return &wrapper; } CLWrapper::CLWrapper() { - CHECK(InitHandle()) << "Fail to initialize the OpenCL library!"; - InitFunctions(); + opencl_lib_found_ = InitHandle(); + CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!"; + dlsym_success_ = InitFunctions(); } bool CLWrapper::InitHandle() { @@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() { } } -void CLWrapper::InitFunctions() { +bool CLWrapper::InitFunctions() { CHECK(handle_ != nullptr) << "The library handle can't be null!"; + bool dlsym_success = true; #define PADDLE_DLSYM(cl_func) \ do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(FATAL) << "Cannot find the " << #cl_func \ + LOG(ERROR) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ + dlsym_success = false; \ break; \ } \ VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \ @@ -106,7 +110,7 @@ void CLWrapper::InitFunctions() { PADDLE_DLSYM(clCreateCommandQueue); // note(ysh329): consider compatibility for cl_driver_version 1.10 // using clCreateCommandQueue instead. 
- // PADDLE_DLSYM(clCreateCommandQueueWithProperties); + // PADDLE_DLSYM(clCreateCommandQueueWithProperties); PADDLE_DLSYM(clReleaseCommandQueue); PADDLE_DLSYM(clCreateProgramWithBinary); PADDLE_DLSYM(clRetainContext); @@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() { PADDLE_DLSYM(clEnqueueCopyImage); #undef PADDLE_DLSYM + return dlsym_success; } } // namespace lite @@ -445,9 +450,8 @@ CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties( // ->clCreateCommandQueueWithProperties()( // context, device, properties, errcode_ret); // - cl_command_queue_properties cl_cmd_properties; return paddle::lite::CLWrapper::Global()->clCreateCommandQueue()( - context, device, cl_cmd_properties, errcode_ret); + context, device, 0, errcode_ret); } CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue( diff --git a/lite/backends/opencl/cl_wrapper.h b/lite/backends/opencl/cl_wrapper.h index 35ef33e5a2f3973217e0e4c36caf1f8eb0fbdcb2..4df86b4028f92883718e7da0967f4a88ab20cc6d 100644 --- a/lite/backends/opencl/cl_wrapper.h +++ b/lite/backends/opencl/cl_wrapper.h @@ -508,13 +508,20 @@ class CLWrapper final { return clEnqueueCopyImage_; } + bool OpenclLibFound() { return opencl_lib_found_; } + + bool DlsymSuccess() { return dlsym_success_; } + private: CLWrapper(); CLWrapper(const CLWrapper &) = delete; CLWrapper &operator=(const CLWrapper &) = delete; bool InitHandle(); - void InitFunctions(); + bool InitFunctions(); + bool opencl_lib_found_{true}; + bool dlsym_success_{true}; void *handle_{nullptr}; + clGetPlatformIDsType clGetPlatformIDs_{nullptr}; clGetPlatformInfoType clGetPlatformInfo_{nullptr}; clBuildProgramType clBuildProgram_{nullptr}; diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 4978dfb84a4ee5770df011c54dccde59a62135b7..0d4301c5b6a56e50eba2d9a6ae13ce353a9b1e2e 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" // DEFINE_string(cudnn_dir, // "", @@ -178,7 +178,7 @@ auto error_msg = #endif // !_WIN32 if (throw_on_error) { CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); + // CHECK(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc index c49984691e5beca5a42defd68243e1352372cf11..6318916dfa53d5cce0c33d0149a520ccb9288c28 100644 --- a/lite/backends/x86/jit/benchmark.cc +++ b/lite/backends/x86/jit/benchmark.cc @@ -319,8 +319,8 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 7e697014ed241a75693b783127633b255964f80b..e6628058d03959a2a58b403a6ad61af6c50b431c 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + CHECK_GT(attr.table_height, 0); + CHECK_GT(attr.table_width, 0); + CHECK_GT(attr.index_height, 0); + CHECK_GT(attr.index_width, 0); + CHECK_GT(attr.out_width, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7bb248dd1d384af949fd3cd190df3d90d21921ef..d013887be5ecec1f67fa022b49b889f9cee9ade4 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index f78df73f66532f891721c74cff9c78cc3bb61922..87fe758809e3e7e18d2f939a26f3729b937bf6f6 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -27,7 +27,7 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); + CHECK_GT(groups.front(), 0); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 
32 : 16) - 1; @@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + CHECK_GT(attr.m, 0); + CHECK_GT(attr.n, 0); + CHECK_GT(attr.k, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index 95edc14201ac94d302ff806d0a4b8f5f50b2835c..8bc1e41d0a17d548c47819b5e11daf7ed5065e86 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -19,7 +19,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + CHECK_EQ(m_, 1) << "Only support m==1 yet"; this->genCode(); } diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index 4c80737aac4bc9cd09f4ff222c8fad8c441887ec..c54093e4dfa00f89f51c70840c45518f3eddfd3d 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + CHECK_GT(attr.w, 0); + CHECK_GT(attr.h, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index a00428f3e0982889665cd23b21a5978c7c239399..a1bde4a9b66f22ef8815bdc61fe866065e7f4203 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); + CHECK_EQ(reg_idx, rest_used_num_regs) + << "All heights should use same regs"; for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 44e083366132c675b339b2da4bbb3b7c1c6b7569..f91f1305ee30af708443e6a9a8bbb3fae2cc0b80 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -17,7 +17,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + CHECK_EQ(attr.param_width, attr.grad_width); + CHECK_LE(attr.selected_rows_size, attr.grad_height); + CHECK_GE(attr.selected_rows_size, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git 
a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index fb1e71f7b0b1e6f68a331d264682e80fbab7c219..7c4860ba5084860b67b6ecb7e3eed8aafb16cb2c 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -16,7 +16,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + CHECK_GT(w, 0); return make_unique(w, CodeSize(w)); } }; diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index a3376be423828b25c6eda6fff30a56578c7bbbe5..a9a89fdb205ad54268986eeee628aec75ac01b74 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,8 +21,8 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free @@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) { #ifdef _WIN32 ptr = _aligned_malloc(size, alignment); #else - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); + CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size + << " error!"; #endif - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size; return ptr; } diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc index 8322f7ebd2ce78f99979574983d81cebe5139606..f80a24d15c4666eacd31770c46f8a7ad4e7cfb37 100644 --- a/lite/backends/x86/jit/helper.cc +++ b/lite/backends/x86/jit/helper.cc @@ -14,9 +14,10 @@ #include "lite/backends/x86/jit/helper.h" #include // tolower +#include #include #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,12 +105,12 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + CHECK_GT(i, 0) << "each element of groups should be larger than 0."; }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); + CHECK_GE(sum * block, n) + << "The packed n should be equal to or larger than n"; const int block_len = sizeof(float) * block; int n_offset = 0; diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h index f741edbbed5b721fb9104a9c9a171a12532e4705..57a3611bb671c6d83ec3212702a57e3fc7d7f35f 100644 --- a/lite/backends/x86/jit/helper.h +++ b/lite/backends/x86/jit/helper.h @@ -23,7 +23,7 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernel_key.h" #include "lite/backends/x86/jit/kernel_pool.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = 
ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + CHECK(ref_iter != ref_pool.end()) + << "Every Kernel should have reference function."; auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -94,7 +94,7 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + CHECK(p) << "The Refer kernel should exsit"; return p->GetFunc(); } @@ -125,7 +125,7 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + CHECK(ref != nullptr) << "Refer Kernel can not be empty."; res.emplace_back(ref); return res; } @@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + CHECK(i) << "jitcode kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + CHECK(i) << "kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -166,7 +166,7 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + CHECK_GE(funcs.size(), 1UL); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. 
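
Since the comment above only hints at how the "default best" kernel is picked, here is a small hedged sketch of how these helpers are typically consumed. VMulTuple and the raw float buffers are assumptions for illustration; the point is that GetDefaultBestFunc returns the front of GetAllCandidateFuncs, i.e. a generated JitCode kernel when one exists, then "more" implementations such as MKL, with the refer kernel as the final fallback.

// Hedged sketch, not part of the patch: resolve the best available kernel
// for a given attribute once, then call it like a plain function pointer.
#include "lite/backends/x86/jit/helper.h"

void ElementwiseMul(const float* x, const float* y, float* z, int n) {
  namespace jit = paddle::lite::jit;
  // attr_type for the XYZN-style tuples is just the vector length `n`.
  auto vmul = jit::GetDefaultBestFunc<jit::VMulTuple<float>,
                                      paddle::lite::fluid::CPUPlace>(n);
  vmul(x, y, z, n);  // same signature as the refer implementation
}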
diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc index a6288fcf19d6867e1e1eb0bce32e559a4f303929..30397ffe1c4980e4af19a7a0eb44b47585b44f2c 100644 --- a/lite/backends/x86/jit/kernel_key.cc +++ b/lite/backends/x86/jit/kernel_key.cc @@ -14,7 +14,7 @@ #include "lite/backends/x86/jit/kernel_key.h" #include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 6bc791e64575b8f481f91ea3c28ea4896fe1860d..473e1253194513c16d6d8c3b52eac110512e806e 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -18,7 +18,7 @@ #include #include #include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,11 +104,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -175,22 +175,22 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h index d8c8d86911ab9a7794192aa68fb0c0571b1e4d26..b7243dfda350e8d0ea5909cf84ae3aa76d845055 100644 --- a/lite/backends/x86/jit/refer/refer.h +++ b/lite/backends/x86/jit/refer/refer.h @@ -22,7 +22,6 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/macro.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" namespace paddle { namespace lite { @@ -480,12 +479,12 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: 
%d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -527,12 +526,12 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc index aafcad579fdefd675323e0e2a6f40bd89c2a0166..03570a56d9c766271be630fe1d2e3048c6c42608 100644 --- a/lite/backends/x86/jit/test.cc +++ b/lite/backends/x86/jit/test.cc @@ -910,8 +910,8 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 5d7e98629cb89bd7a3fdee852507e0f381e54931..274e8836dd6e59d610ddeb7a63f898cdc1b19cc1 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -116,7 +116,7 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); // if (!lite::fluid::CheckLoD(lod)) { - // //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + // //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod)); //} selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 3bc5f9f67ad96e7ec699400ff6369fe48c745b7e..4c6bf06951f81e90a73c91c2378f904db5678495 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); + CHECK_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 34b258892be05625ae88076eff175f56a53d3537..4a64e45ea945f2d46c06ba31d67bd2a0fbf7c635 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -287,22 +287,22 @@ struct CBlas { template <> struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; } static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + LOG(FATAL) << "float16 SMM_GEMM not supported on CPU"; } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) 
{ PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; } + static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; } static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + LOG(FATAL) << "float16 VSQUARE not supported on CPU"; } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; } + static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; }; + static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; }; + static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU"; } #endif }; @@ -461,11 +461,11 @@ void Blas::MatMul(const lite::Tensor &mat_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - // PADDLE_ENFORCE( - // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - // "The targets of matrices must be same"); + CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2) + << "The input and output of matmul be matrix"; + // CHECK( + // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target()) + // << "The targets of matrices must be same"; int M = dim_out[0]; int N = dim_out[1]; @@ -746,7 +746,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, T alpha, lite::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + CHECK_EQ(dim_a.width_, dim_b.height_); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -761,8 +761,8 @@ void Blas::MatMul(const lite::Tensor &mat_a, beta, mat_out->template mutable_data()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); + CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0); this->template BatchedGEMM( transA, transB, diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h index 0c56e0d759fd9b1e3abba5209f43d7a0c8fe194e..72a2f4ce12cbd72b26cd87e97d0178275a4b4abd 100644 --- a/lite/backends/x86/math/context_project.h +++ b/lite/backends/x86/math/context_project.h @@ -146,7 +146,7 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); + CHECK(padding_data != nullptr); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h index 9ff64d53f069d2e4c5b639d273af5b4aa5738b2b..0e721cc8c272eee4b1df1f4b254b5e1d0c1ebb0a 100644 --- a/lite/backends/x86/math/cpu_vec.h +++ b/lite/backends/x86/math/cpu_vec.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #ifdef PADDLE_WITH_MKLML #include "lite/backends/x86/mklml.h" @@ -652,7 +652,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + LOG(FATAL) << "Not support type: " << type; } }; diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 941a34643669f060cdd18f38f92c39e529da7b19..2419620111b7ace292d8a2d366fc1dce2353a15c 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -57,7 +57,7 @@ class CrossEntropyFunctor { for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < num_remain; j++) { int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); + CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); int index = i * num_classes + lbl * num_remain + j; int loss_idx = i * num_remain + j; loss_data[loss_idx] = diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h index 6b66f0b08548c1306681409345c051d1ab40a7c0..d2a66083ac1a72de9e5e469618fc387a5ea784dc 100644 --- a/lite/backends/x86/math/cross_entropy.h +++ b/lite/backends/x86/math/cross_entropy.h @@ -27,7 +27,7 @@ namespace math { template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); + CHECK(static_cast(std::is_floating_point::value)); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h index 6a13a3d471e10970b36120a12b21a36256350803..dc3c3eac1989f256378e408b8e8e4401bea43e7c 100644 --- a/lite/backends/x86/math/detail/activation_functions.h +++ b/lite/backends/x86/math/detail/activation_functions.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) { return ActivationType::kIdentity; } LOG(ERROR) << "Not support type " << type; - // PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); return ActivationType(); } diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h index 86b7a91f4127de50aeb5c5fb02122bced0af4188..767e9b9da0e2977f566c793c2fdc71f83ab5b6d4 100644 --- a/lite/backends/x86/math/gru_compute.h +++ b/lite/backends/x86/math/gru_compute.h @@ -13,7 +13,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index b916c912ffc2a4d62b63b98fdce150b353ba087e..abbd9b0e2811913f6aff79561e365d20bffbeae4 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/im2col.h" #include #include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -38,8 +38,8 @@ class Im2ColFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -72,8 +72,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -82,20 +82,20 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -214,8 +214,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -224,16 +224,16 @@ class Col2ImFunctortemplate mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/x86/math/lstm_compute.h b/lite/backends/x86/math/lstm_compute.h index ddb7bea9995ebcca978be97f8295eb07b0e4e17e..b403770cca7248fba10e62708dddfb91f2789488 100644 --- a/lite/backends/x86/math/lstm_compute.h +++ b/lite/backends/x86/math/lstm_compute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index cb1781db2199c1b7a12aaec80b1904f65b23b534..cc4aa5d9fa54c50eb944714c14a5f6b15634a181 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -121,8 +121,8 @@ struct RowwiseAdd { lite::Tensor* output) { const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + CHECK_EQ(vector.numel(), size); + CHECK_EQ(output->dims(), in_dims); const T* input_data = input.data(); const T* vector_data = vector.data(); diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h index 8f629b5f171814f0df8e51e61123c7c0aabf7643..7081ec0053e0b4194730e6f4353e1274d6019bb4 100644 --- a/lite/backends/x86/math/math_function.h +++ b/lite/backends/x86/math/math_function.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" +#include "lite/utils/cp_logging.h" +// #include "lite/tensor_util.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index acfb76759f6fc9fa4122afd2388bc3adf8f5ea22..9bbfebcfb2feb0e3c9d68261240bed18888350c3 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -59,7 +59,7 @@ void ColwiseSum::operator()(const lite::Context& context, lite::TensorLite* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -81,7 +81,7 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -103,8 +103,8 @@ void RowwiseMean::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -124,10 +124,10 @@ class RowwiseMean { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); auto inv_size = 1.0 / size; T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -147,8 +147,8 @@ void RowwiseSum::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -168,10 +168,10 @@ class RowwiseSum { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc index 19122a6169fbbe1729e38389b0006b11190bc206..b3511ca3521634a771965348e754e10bfd72e19f 100644 --- a/lite/backends/x86/math/math_function_test.cc +++ b/lite/backends/x86/math/math_function_test.cc @@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + CHECK_EQ(10, t.data()[i]); } delete ctx; } diff --git a/lite/backends/x86/math/sampler.h 
b/lite/backends/x86/math/sampler.h index efd9e48e5443186b6b735287cc150f99cb42be81..07cca52e1f436c2979a331dd27c2ddc554c0dad8 100644 --- a/lite/backends/x86/math/sampler.h +++ b/lite/backends/x86/math/sampler.h @@ -32,7 +32,7 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + // CHECK_GT(range, 0, "Range should be greater than 0."); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index 03a18587f4a029bcaebe484ca1ab1951e7c3ecad..8e2a81905b871902aa8ec79c9dd718a62c9f6dec 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -31,7 +31,7 @@ struct SelectedRowsAdd { const fluid::SelectedRows& input2, fluid::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + CHECK_EQ(in1_height, input2.height()); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -49,8 +49,8 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size()); auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); @@ -73,15 +73,15 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2.numel() / in1_height); + CHECK_EQ(in1_row_numel, output->numel() / in1_height); SetConstant functor; functor(context, output, 0.0); @@ -113,7 +113,7 @@ struct SelectedRowsAddTo { const int64_t input2_offset, fluid::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -149,7 +149,7 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); } // concat rows std::vector in2_rows; @@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template mutable_data(); @@ -291,12 +291,11 @@ struct 
MergeAdd { if (input->rows().size() == 0) { continue; } - PADDLE_ENFORCE_EQ(input_width, - input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); - PADDLE_ENFORCE_EQ( - input_height, input->height(), "all input should have same height"); + CHECK_EQ(input_width, input->value().dims()[1]) + << "all input should have same " + "dimension except for the first one"; + CHECK_EQ(input_height, input->height()) + << "all input should have same height"; row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -376,13 +375,13 @@ struct UpdateToTensor { lite::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template data(); diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index aa7aeac532e2fa1f90d452924b364be1896ee862..597521b6e7cac49ac91dbddac71af22bb5a8760c 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor { const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); + CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index 796894cb7d18ec4db7b670276bb3d3fc5b1427f8..953576eea4170cca57f10bb977ca9bfecb36ae6d 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/eigen.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - lods[1].size(), - static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(lods.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; const auto& lod = lods[0]; @@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor { const lite::Tensor& batch, lite::Tensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - in_lod[1].size(), - static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(in_lod.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76..3b2f8bfc4f58a4bfcab968a9288eb8d1d817d78d 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor, layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; - PADDLE_ENFORCE_GE( - pad_seq_len, - valid_seq_len, - "The padded sequence length can not be less than its original length."); + CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can " + "not be less than its original " + "length."; int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width @@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor { pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width) + << "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."; // fill padding value T* pad_data = pad_tensor->template mutable_data(); diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 43407014dea0ed0c78ab29da7fb8ebb0e0310566..5512c4aa11fb5dc05283d01b1d6d3da7fb83c064 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + CHECK_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back()) + << "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."; - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size()) + << "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."; } /* diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 2d00ebad61840da5b14fbf12d9255394b2b2df1a..c1ddb030349a7f7f46fd6b98d3f967eb6fdfe48e 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,12 +46,12 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + CHECK_EQ(idx_dims, out_dims); auto starts = input.lod()[0]; const T* in_data = input.data(); @@ -95,10 +95,10 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } auto starts = input.lod()[0]; @@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); + CHECK_GT(og_dims.size(), 1); + 
CHECK_GT(ig_dims.size(), 1); for (size_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + CHECK_EQ(og_dims[i], ig_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + CHECK_EQ(idx_dims, og_dims); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[0]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); + CHECK(in_w == out_w); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); @@ -330,7 +330,7 @@ class SequencePoolFunctor { out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } @@ -389,7 +389,7 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index b91f43a571994bef95650361a6dc62c0465837a7..8bba0f92055dbee5a81bf12ab2fa5cc6592bd60c 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -50,9 +50,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { in_grad.mutable_data(in_dims, context->GetPlace()); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + CHECK_EQ(in_grad.dims().size(), out_grad.dims().size()); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + CHECK_EQ(in_grad.dims()[i], out_grad.dims()[i]); } // call functor diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index c54bb2099edd0a7e6be61cfdff6340734f09116a..bcab1e77c0bef356453bf1ea1f30aabfc9f1dff0 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -55,7 +55,7 @@ void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + CHECK_EQ(edge_set_dims[1], 2); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 119d7294e9ec21e67f09776ad20d04f15b8b81ce..7ff132cbf121172b5bf35966637080d599eaf498 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "lite/backends/x86/math/unpooling.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -41,7 +41,7 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; output_data[index] = input_data[i]; } input_data += input_feasize; @@ -77,7 +77,7 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 91979bb7fdcfe66d84ded3f9797144ddafc8769e..8e8f44be55fc2df342092ad399f00bcc7941908d 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "lite/backends/x86/math/vol2col.h" #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -36,8 +36,8 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* col) const { - PADDLE_ENFORCE(vol.dims().size() == 4); - PADDLE_ENFORCE(col->dims().size() == 7); + CHECK_EQ(vol.dims().size(), 4); + CHECK_EQ(col->dims().size(), 7); int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; @@ -52,27 +52,27 @@ class Vol2ColFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; const T* vol_data = vol.data(); T* col_data = col->template mutable_data(); @@ -122,8 +122,8 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* vol) const { - PADDLE_ENFORCE(vol->dims().size() == 4); - PADDLE_ENFORCE(col.dims().size() == 7); + CHECK_EQ(vol->dims().size(), 4); + CHECK_EQ(col.dims().size(), 7); int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; @@ -138,27 +138,27 @@ class Col2VolFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - 
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; T* vol_data = vol->template mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/xpu/debug.h b/lite/backends/xpu/debug.h new file mode 100644 index 0000000000000000000000000000000000000000..56bafc9c3d3a7772af8fc8afd10fc7efa3415ef7 --- /dev/null +++ b/lite/backends/xpu/debug.h @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/backends/xpu/target_wrapper.h" + +namespace paddle { +namespace lite { +namespace xpu { + +template +void DumpCPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = ptr[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +template +void DumpXPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr cpu_mem(new T[len]); + XPU_CALL(xpu_memcpy( + cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = cpu_mem[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += cpu_mem[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 5dcbc1e275cca8c32003cbef74dfb1f6d4caee93..a322418ccde20a34dc6c6ba9b47601a9a658f99c 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -13,18 +13,17 @@ // limitations under the License. 
#include "lite/backends/xpu/target_wrapper.h" -#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { void* TargetWrapperXPU::Malloc(size_t size) { void* ptr{nullptr}; - xpu_malloc(&ptr, size); + XPU_CALL(xpu_malloc(&ptr, size)); return ptr; } -void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } +void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); } void TargetWrapperXPU::MemcpySync(void* dst, const void* src, @@ -32,15 +31,31 @@ void TargetWrapperXPU::MemcpySync(void* dst, IoDirection dir) { switch (dir) { case IoDirection::HtoD: - xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE)); break; case IoDirection::DtoH: - xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST)); break; default: LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); } } +XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size, + bool use_l3) { + void* ptr{nullptr}; + if (use_l3) { + ptr = xdnn::alloc_workspace(GetRawContext(), size); + } else { + ptr = TargetWrapperXPU::Malloc(size); + } + CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3; + return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3)); +} + +std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT +int TargetWrapperXPU::workspace_l3_size_per_thread{0}; +thread_local xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr}; + } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index c42d4139246085d8b9a367b45b60699209d0b668..070184a13088a169fe38f1b8105a0803d9915da1 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -14,13 +14,45 @@ #pragma once -#include "lite/core/target_wrapper.h" +#include // std::unique_ptr +#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free +#include "lite/core/target_wrapper.h" // TargetWrapper +#include "lite/utils/cp_logging.h" // CHECK_EQ + +#define XPU_CALL(func) \ + { \ + auto e = (func); \ + CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \ + } namespace paddle { namespace lite { +// MAX(lod.size()) = 64 +const int XPU_MAX_LOD_SIZE = 64; +// MAX(lod[i + 1] - lod[i]) = 512 +const int XPU_MAX_LOD_SEQ_LEN = 512; + using TargetWrapperXPU = TargetWrapper; +struct XPUScratchPad { + XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {} + + void* addr_{nullptr}; + bool is_l3_{false}; +}; + +struct XPUScratchPadDeleter { + void operator()(XPUScratchPad* sp) const { + if (!sp->is_l3_) { + XPU_CALL(xpu_free(sp->addr_)); + } + delete sp; + } +}; + +using XPUScratchPadGuard = std::unique_ptr; + template <> class TargetWrapper { public: @@ -34,6 +66,40 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); + + static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false); + + static xdnn::Context* GetRawContext() { + if (tls_raw_ctx_ == nullptr) { + tls_raw_ctx_ = xdnn::create_context(); + CHECK(tls_raw_ctx_); + int r = xdnn::set_workspace_l3_size(tls_raw_ctx_, + workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", workspace_l3_size_per_thread = " + << workspace_l3_size_per_thread; + } + } + return tls_raw_ctx_; + } + + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + static void SetDev(int dev_no = 0) { + const char* dev_env = 
getenv("LITE_XPU_DEV"); + if (dev_env) { + dev_no = atoi(dev_env); + } + + XPU_CALL(xpu_set_device(dev_no)); + } + + static std::string multi_encoder_precision; // NOLINT + static int workspace_l3_size_per_thread; + + private: + static thread_local xdnn::Context* tls_raw_ctx_; }; } // namespace lite diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 56a5c9b8f7ea0ed47d21629d7ccf083b4f9fa232..af2bfbe86aaa1b3f145838015a6d6a62090cb3b1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -121,7 +121,7 @@ lite_cc_library(kernel SRCS kernel.cc PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor + cpp_op_desc tensor utils ) add_dependencies(kernel kernel_list_h) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 75971570fb078ce4e39413e5b3df629fe2a7ac3e..53988f063b89ae3e75f4c27cc1d937d12bb6dae5 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..599e8f6c3791ac68474ca27e6c627bd2fc43765a 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/core/arena/framework.h" +#include #include "lite/core/context.h" #include "lite/operators/subgraph_op.h" @@ -22,37 +23,54 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; - if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) { + static const std::set subgraph_op_supported_targets( + {TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)}); + bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != + subgraph_op_supported_targets.end(); +#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + enable_subgraph_op = false; // Use XPU kernel directly if XTCL is disabled. 
+#endif + if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc + auto sub_program_desc = std::make_shared(); int sub_block_idx = 0; - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_desc_; + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_desc_; // Add the block desc into the subgraph op which used to replace the // original op op_desc_.reset(new cpp::OpDesc()); op_desc_->SetType("subgraph"); op_desc_->SetAttr("sub_block", sub_block_idx); - auto in_names = sub_block_op_desc->input_vars(); - auto out_names = sub_block_op_desc->output_vars(); + auto in_names = sub_op_desc->input_vars(); + auto out_names = sub_op_desc->output_vars(); op_desc_->SetInput("Inputs", in_names); op_desc_->SetOutput("Outputs", out_names); - op_desc_->SetAttr>("input_data_names", in_names); + // filter only data op (not const op by persisiable) + std::vector in_data_names; + for (auto name : in_names) { + if (!(inst_scope_->FindTensor(name)->persistable())) { + in_data_names.push_back(name); + } + } + op_desc_->SetAttr>("input_data_names", + in_data_names); op_desc_->SetAttr>("output_data_names", out_names); op = LiteOpRegistry::Global().Create(op_desc().Type()); - static_cast(op.get())->SetSubBlock(sub_block_desc); + static_cast(op.get())->SetProgramDesc( + sub_program_desc); } else { op = LiteOpRegistry::Global().Create(op_desc().Type()); } CHECK(op) << "no op for " << op_desc().Type(); - op->Attach(*op_desc_, inst_scope_); + op->Attach(*op_desc_, inst_scope_.get()); auto kernels = op->CreateKernels({place_}); // filter out the target kernel CHECK(!kernels.empty()) << "No kernel found for place " << place_.DebugString(); - auto it = std::remove_if( + auto it = std::find_if( kernels.begin(), kernels.end(), [&](std::unique_ptr& k) { return k->alias() == alias_; }); @@ -72,53 +90,35 @@ void TestCase::CreateInstruction() { void TestCase::PrepareInputsForInstruction() { for (auto& arg : op_desc().InputArgumentNames()) { for (auto& var : op_desc().Input(arg)) { - std::string kernel_key = instruction_->kernel()->key_with_alias(); - const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( - place_, kernel_key, arg); - - const Type* inst_type = nullptr; - if (param_type->type->IsTensor()) { - inst_type = Type::GetTensorTy(TARGET(kHost)); - } else if (param_type->type->IsTensorList()) { - inst_type = Type::GetTensorListTy(TARGET(kHost)); - } else { - LOG(FATAL) << "unsupported param_type"; - } - - CHECK(scope_->FindVar(var)); - if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor or tensor_array in the instruction's scope, - /// alloc memory and then copy data there. 
- if (param_type->type->IsTensor()) { - const auto* shared_tensor = scope_->FindTensor(var); - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); - } else if (param_type->type->IsTensorList()) { - const auto* shared_tensor_array = - scope_->FindVar(var)->GetMutable>(); - auto* target_tensor_array = - inst_scope_->Var(var)->GetMutable>(); - CHECK(!shared_tensor_array->empty()) - << "shared_tensor_array is empty yet"; - target_tensor_array->resize(shared_tensor_array->size()); - for (size_t i = 0; i < shared_tensor_array->size(); i++) { - target_tensor_array->at(i).Resize( - shared_tensor_array->at(i).dims()); - TargetCopy(param_type->type->target(), - target_tensor_array->at(i).mutable_data( - param_type->type->target(), - shared_tensor_array->at(i).memory_size()), - shared_tensor_array->at(i).raw_data(), - shared_tensor_array->at(i).memory_size()); - } - } else { - LOG(FATAL) << "not support"; + const auto* type = instruction_->kernel()->GetInputDeclType(arg); + CHECK(base_scope_->FindVar(var)); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. + if (type->IsTensor() && + !TargetCompatibleTo(*Type::GetTensorTy(TARGET(kHost)), *type)) { + const auto* base_tensor = base_scope_->FindTensor(var); + auto* inst_tensor = inst_scope_->FindMutableTensor(var); + CHECK(!base_tensor->dims().empty()) + << "The dims of input tensor is empty yet"; + TargetCopy(type->target(), + inst_tensor->mutable_data(type->target(), + base_tensor->memory_size()), + base_tensor->raw_data(), + base_tensor->memory_size()); + } else if (type->IsTensorList() && + !TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)), + *type)) { + const auto* base_tensor_list = base_scope_->FindTensorList(var); + auto* inst_tensor_list = inst_scope_->FindMutableTensorList(var); + CHECK_EQ(base_tensor_list->size(), inst_tensor_list->size()); + for (size_t i = 0; i < base_tensor_list->size(); i++) { + CHECK(!base_tensor_list->at(i).dims().empty()) + << "The dims of input tensor[" << i << "] is empty yet"; + TargetCopy(type->target(), + inst_tensor_list->at(i).mutable_data( + type->target(), base_tensor_list->at(i).memory_size()), + inst_tensor_list->at(i).raw_data(), + inst_tensor_list->at(i).memory_size()); } } } @@ -126,78 +126,88 @@ void TestCase::PrepareInputsForInstruction() { } template -bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error) { - CHECK(a_tensor); - CHECK(b_tensor); + CHECK(inst_tensor); + CHECK(base_tensor); - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + CHECK(ShapeEquals(inst_tensor->dims(), base_tensor->dims())); - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + CHECK(inst_tensor->lod() == base_tensor->lod()) << "lod not match"; // The baseline should output in host devices. 
- CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { + CHECK(base_tensor->target() == TARGET(kHost) || + base_tensor->target() == TARGET(kX86) || + base_tensor->target() == TARGET(kARM)); + const T* inst_data{}; + Tensor inst_host_tensor; + inst_host_tensor.Resize(inst_tensor->dims()); + switch (inst_tensor->target()) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); + inst_data = static_cast(inst_tensor->raw_data()); break; +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + CopySync(inst_host_tensor.mutable_data(), + inst_tensor->raw_data(), + sizeof(T) * inst_tensor->dims().production(), + IoDirection::DtoH); + inst_data = inst_host_tensor.data(); + break; +#endif default: // Before compare, need to copy data from `target` device to host. LOG(FATAL) << "Not supported"; } - CHECK(a_data); + CHECK(inst_data); - const T* b_data = static_cast(b_tensor->raw_data()); + const T* base_data = static_cast(base_tensor->raw_data()); bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { + for (int i = 0; i < inst_tensor->dims().production(); i++) { + EXPECT_NEAR(inst_data[i], base_data[i], abs_error); + if (fabsf(inst_data[i] - base_data[i]) > abs_error) { success = false; } } return success; } -bool TestCase::CheckPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error, PrecisionType precision_type) { PrecisionType precision_type_t = precision_type; if (precision_type == PRECISION(kAny)) { - precision_type_t = b_tensor->precision(); + precision_type_t = base_tensor->precision(); } - CHECK(precision_type_t == b_tensor->precision()) + CHECK(precision_type_t == base_tensor->precision()) << "arg precision type and base tensor precision type are not matched! " "arg precision type is: " << PrecisionToStr(precision_type) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); - CHECK(a_tensor->precision() == b_tensor->precision()) + << PrecisionToStr(base_tensor->precision()); + CHECK(inst_tensor->precision() == base_tensor->precision()) << "real tensor precision type and base tensor precision type are not " "matched! 
real tensor precision type is: " - << PrecisionToStr(a_tensor->precision()) + << PrecisionToStr(inst_tensor->precision()) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); + << PrecisionToStr(base_tensor->precision()); switch (precision_type_t) { case PRECISION(kFloat): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt8): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt32): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt64): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kBool): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); default: LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); return false; @@ -209,24 +219,24 @@ bool TestCase::CheckPrecision(const std::string& var_name, PrecisionType precision_type) { bool success = true; if (inst_scope_->FindVar(var_name)->IsType()) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + auto inst_tensor = inst_scope_->FindTensor(var_name); + auto base_tensor = base_scope_->FindTensor(var_name); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } else if (inst_scope_->FindVar(var_name)->IsType>()) { - auto a_tensor_array = - inst_scope_->FindVar(var_name)->GetMutable>(); - auto b_tensor_array = - base_scope_->FindVar(var_name)->GetMutable>(); - CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (size_t i = 0; i < a_tensor_array->size(); i++) { - Tensor* a_tensor = &(a_tensor_array->at(i)); - Tensor* b_tensor = &(b_tensor_array->at(i)); - if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + auto inst_tensor_list = inst_scope_->FindMutableTensorList(var_name); + auto base_tensor_list = base_scope_->FindMutableTensorList(var_name); + CHECK_EQ(inst_tensor_list->size(), base_tensor_list->size()); + for (size_t i = 0; i < inst_tensor_list->size(); i++) { + Tensor* inst_tensor = &(inst_tensor_list->at(i)); + Tensor* base_tensor = &(base_tensor_list->at(i)); + if (inst_tensor->dims().size() == 0 && base_tensor->dims().size() == 0) { continue; } - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } } else { LOG(FATAL) << "unsupported var type"; @@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name, return success; } -TestCase::~TestCase() { - if (op_desc_->Type() == "subgraph") { - // Release the subblock desc of Subgraph op - auto subgraph_op = const_cast( - static_cast(instruction_->op())); - CHECK(subgraph_op); - auto sub_block_desc = subgraph_op->GetSubBlock(); - if (sub_block_desc) { - delete sub_block_desc; - } - } -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index cf864a32044e3dfd03ecd03327a0db69275ef586..4ccb05428d38c65f8cad36f1702c034cfe62705b 100644 --- 
a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -28,7 +28,7 @@ #include "lite/core/program.h" #include "lite/core/scope.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { @@ -40,13 +40,15 @@ namespace arena { class TestCase { public: explicit TestCase(const Place& place, const std::string& alias) - : place_(place), scope_(new Scope), alias_(alias) { + : place_(place), + alias_(alias), + inst_scope_(new Scope), + base_scope_(new Scope) { ctx_ = ContextScheduler::Global().NewContext(place_.target); } - virtual ~TestCase(); + virtual ~TestCase() {} void Prepare() { - PrepareScopes(); PrepareData(); op_desc_.reset(new cpp::OpDesc); PrepareOpDesc(op_desc_.get()); @@ -91,16 +93,15 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - Scope& scope() { return *scope_; } - - Scope* baseline_scope() { return base_scope_; } - Scope* inst_scope() { return inst_scope_; } + Scope* baseline_scope() { return base_scope_.get(); } + Scope* inst_scope() { return inst_scope_.get(); } protected: // Prepare inputs in scope() for Tester. virtual void PrepareData() = 0; - /// Prepare a tensor in host. The tensors will be created in scope_. + /// Prepare a tensor in host. The tensors will be created both in base_scope_ + /// and inst_scope_. /// Need to specify the targets other than X86 or ARM. template void SetCommonTensor(const std::string& var_name, @@ -108,42 +109,47 @@ class TestCase { const T* data, const LoD& lod = {}, bool is_persistable = false) { - auto* tensor = scope_->NewTensor(var_name); - tensor->Resize(ddim); - auto* d = tensor->mutable_data(); - memcpy(d, data, ddim.production() * sizeof(T)); + // Create and fill a input tensor with the given data for baseline + auto* base_tensor = base_scope_->NewTensor(var_name); + base_tensor->Resize(ddim); + memcpy(base_tensor->mutable_data(), data, ddim.production() * sizeof(T)); // set lod - if (!lod.empty()) *tensor->mutable_lod() = lod; + if (!lod.empty()) *base_tensor->mutable_lod() = lod; // set persistable - tensor->set_persistable(is_persistable); + base_tensor->set_persistable(is_persistable); + + // Create a copy for instruction + auto* inst_tensor = inst_scope_->NewTensor(var_name); + inst_tensor->CopyDataFrom(*base_tensor); } /// Prepare a tensor_array in host. The tensors will be created in scope_. /// Need to specify the targets other than X86 or ARM. 
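  // Editor's note: an illustrative sketch, not part of this patch. It shows how
  // a derived test case might feed inputs through the two-scope helpers above;
  // the op name "foo", the shapes and the values are hypothetical. Each helper
  // fills an identical copy of the data into base_scope_ (read by RunBaseline)
  // and inst_scope_ (read by the instruction under test):
  //
  //   class FooComputeTester : public arena::TestCase {
  //    public:
  //     using arena::TestCase::TestCase;
  //     void PrepareData() override {
  //       std::vector<float> x(6, 1.f);
  //       SetCommonTensor("x", DDim(std::vector<int64_t>{2, 3}), x.data());
  //       SetCommonTensorList<float>(
  //           "xs", {DDim(std::vector<int64_t>{2, 3})}, {x});
  //     }
  //     void PrepareOpDesc(cpp::OpDesc* op_desc) override { /* set type "foo" */ }
  //     void RunBaseline(Scope* scope) override { /* fill expected outputs */ }
  //   };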
template void SetCommonTensorList(const std::string& var_name, - const std::vector& array_tensor_dims, + const std::vector& ddims, const std::vector>& datas, const std::vector& lods = {}) { - CHECK_EQ(array_tensor_dims.size(), datas.size()); + // Create a tensor array for baseline, and a copy for instruction + CHECK_EQ(ddims.size(), datas.size()); if (!lods.empty()) { - CHECK_EQ(array_tensor_dims.size(), lods.size()); + CHECK_EQ(ddims.size(), lods.size()); } - auto* tensor_array = - scope_->Var(var_name)->GetMutable>(); - for (int i = 0; i < array_tensor_dims.size(); i++) { - Tensor tmp; - tmp.Resize(array_tensor_dims[i]); - auto* tmp_data = tmp.mutable_data(); - memcpy(tmp_data, + auto* base_tensor_list = base_scope_->NewTensorList(var_name); + auto* inst_tensor_list = inst_scope_->NewTensorList(var_name); + for (int i = 0; i < ddims.size(); i++) { + Tensor item; + item.Resize(ddims[i]); + memcpy(item.mutable_data(), datas[i].data(), - array_tensor_dims[i].production() * sizeof(T)); + ddims[i].production() * sizeof(T)); if (!lods.empty()) { - tmp.set_lod(lods[i]); + item.set_lod(lods[i]); } - tensor_array->push_back(tmp); + base_tensor_list->push_back(item); + inst_tensor_list->push_back(item); } } @@ -157,11 +163,6 @@ class TestCase { std::unique_ptr ctx_; void CreateInstruction(); - void PrepareScopes() { - inst_scope_ = &scope_->NewScope(); - base_scope_ = &scope_->NewScope(); - } - // Check shape // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { @@ -172,25 +173,23 @@ class TestCase { return true; } - /// Copy the input tensors to target devices needed by the instruction. + // Copy the host tensors to the device tensors if needed by the instruction. void PrepareInputsForInstruction(); // Create output tensors and variables. void PrepareOutputsForInstruction() { for (auto x : op_desc().output_vars()) { - inst_scope_->NewTensor(x); - base_scope_->NewTensor(x); + inst_scope_->Var(x); } } private: Place place_; - std::shared_ptr scope_; std::string alias_; // The workspace for the Instruction. - Scope* inst_scope_{}; + std::shared_ptr inst_scope_; // The workspace for the baseline implementation. 
- Scope* base_scope_{}; + std::shared_ptr base_scope_; std::unique_ptr op_desc_; std::unique_ptr instruction_; }; diff --git a/lite/core/context.cc b/lite/core/context.cc index eb8f90d7fa90d459846b24bc93b5d26cdfc3969a..abb44945ec66e1a89efc1ccb08ec1df370f2e099 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -17,14 +17,19 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_NPU -std::string Context::subgraph_model_cache_dir_{""}; // NOLINT +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +thread_local std::string + Context::subgraph_model_cache_dir_{ + ""}; // NOLINT +thread_local int + Context::huawei_ascend_device_id_{ + 0}; // NOLINT #endif -#ifdef LITE_WITH_XPU -std::string Context::_multi_encoder_precision; // NOLINT -thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; -int Context::_workspace_l3_size_per_thread{0}; +#ifdef LITE_WITH_MLU +int Context::next_queue_id_{0}; +std::map Context::queue_id_map_; +std::mutex Context::map_mutex_; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index f606eeffaf8ccf932e2d17f03478d4d893ee482d..69f6a4b9d6bc87422d06e66e8d329547ccf5f24a 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -25,6 +25,7 @@ #ifdef LITE_WITH_MLU #include #include +#include // NOLINT #include "lite/backends/mlu/mlu_utils.h" #endif #ifdef LITE_WITH_XPU @@ -38,6 +39,7 @@ #include #include #include "lite/core/device_info.h" +#include "lite/core/scope.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" @@ -60,6 +62,7 @@ using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; using RKNPUContext = Context; +using HuaweiAscendNPUContext = Context; template <> class Context { @@ -83,6 +86,35 @@ class Context { NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } + static void SetSubgraphModelCacheDir(Scope* scope, + std::string subgraph_model_cache_dir) { + auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR"); + CHECK(var); + auto data = var->GetMutable(); + CHECK(data); + *data = subgraph_model_cache_dir; + } + static std::string SubgraphModelCacheDir(Scope* scope) { + auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR"); + if (!var) return ""; + return var->Get(); + } +}; +#endif + +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +template <> +class Context { + public: + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(HuaweiAscendNPUContext* ctx) {} + + HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) { + return *this; + } + std::string name() const { return "HuaweiAscendNPUContext"; } + static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) { subgraph_model_cache_dir_ = subgraph_model_cache_dir; } @@ -90,8 +122,14 @@ class Context { return subgraph_model_cache_dir_; } + static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) { + huawei_ascend_device_id_ = huawei_ascend_device_id; + } + static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; } + private: - static std::string subgraph_model_cache_dir_; + static thread_local std::string subgraph_model_cache_dir_; + static thread_local int huawei_ascend_device_id_; }; #endif @@ -143,45 +181,12 @@ class Context { void CopySharedTo(XPUContext* ctx) {} + // TODO(miaotianxiang): remove this static xdnn::Context* GetRawContext() { - if (_tls_raw_ctx == nullptr) { - _tls_raw_ctx = xdnn::create_context(); - CHECK(_tls_raw_ctx); - int r = 
xdnn::set_workspace_l3_size(_tls_raw_ctx, - _workspace_l3_size_per_thread); - if (r != 0) { - LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r - << ", _workspace_l3_size_per_thread = " - << _workspace_l3_size_per_thread; - } - } - return _tls_raw_ctx; - } - - static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - _workspace_l3_size_per_thread = l3_size; - } - - // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker - // thread - static void SetDev(int dev_no = 0) { - const char* dev_env = getenv("LITE_XPU_DEV"); - if (dev_env) { - xpu_set_device(atoi(dev_env)); - return; - } - - xpu_set_device(dev_no); + return TargetWrapperXPU::GetRawContext(); } std::string name() const { return "XPUContext"; } - - public: - static std::string _multi_encoder_precision; // NOLINT - - private: - static thread_local xdnn::Context* _tls_raw_ctx; - static int _workspace_l3_size_per_thread; }; #endif @@ -249,11 +254,11 @@ class Context { void InitOnce() {} MLUContext& operator=(const MLUContext& ctx) { - this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + this->Init(ctx.device_id_, ctx.exec_queue_id_); return *this; } - void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + void Init(int dev_id, int exec_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { @@ -264,21 +269,19 @@ class Context { device_id_ = dev_id; } SetMluDevice(device_id_); - if (io_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "data queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - io_queue_id = 0; - } - if (exec_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "exec queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - exec_queue_id = 0; + + // get queue id from map + std::unique_lock lk(map_mutex_); + if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) { + queue_id_map_[exec_queue_id] = + next_queue_id_++ % devs[dev_id].max_queue(); } - io_queue_ = devs[dev_id].io_queues()[io_queue_id]; - exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; + exec_queue_id_ = queue_id_map_[exec_queue_id]; + VLOG(4) << "pick mlu queue id: " << exec_queue_id_; + lk.unlock(); - exec_queue_id_ = exec_queue_id; - io_queue_id_ = io_queue_id; + io_queue_ = devs[dev_id].io_queues()[exec_queue_id_]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_]; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } @@ -290,10 +293,12 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return DeviceInfo::Global().MLUCoreVersion(); + return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); } - int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + int MLUCoreNumber() { + return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); + } u32_t affinity() { return affinity_; } @@ -304,10 +309,12 @@ class Context { std::string name() const { return "MLUContext"; } private: + static int next_queue_id_; + static std::map queue_id_map_; + static std::mutex map_mutex_; int device_id_; // overall information int exec_queue_id_; - int io_queue_id_; cnrtQueue_t io_queue_; cnrtQueue_t exec_queue_; @@ -415,6 +422,13 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + case TARGET(kHuaweiAscendNPU): + kernel_contexts_[TargetType::kHuaweiAscendNPU] + .As() + .CopySharedTo(&ctx->As()); + break; 
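      // Editor's note: an illustrative sketch, not part of this patch. Since
      // the subgraph model cache dir and device id introduced above are
      // thread_local statics of HuaweiAscendNPUContext, each worker thread
      // would typically set its own values before asking the scheduler for a
      // context (the values below are hypothetical):
      //
      //   HuaweiAscendNPUContext::SetHuaweiAscendDeviceID(0);
      //   HuaweiAscendNPUContext::SetSubgraphModelCacheDir("/data/local/tmp/cache");
      //   auto ctx = ContextScheduler::Global().NewContext(TARGET(kHuaweiAscendNPU));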
+#endif #ifdef LITE_WITH_APU case TARGET(kAPU): kernel_contexts_[TargetType::kAPU].As().CopySharedTo( @@ -455,7 +469,7 @@ class ContextScheduler { case TARGET(kMLU): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kMLU].As().CopySharedTo( &context); LOG(INFO) << "New Context for MLU"; @@ -496,6 +510,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + InitContext(); +#endif #ifdef LITE_WITH_APU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index ac79ede37406188f495690179b4a4886bc009d80..6d404cee9718a94d2646728c8f2d79576ceb7860 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,15 +66,6 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; -#ifdef LITE_WITH_MLU -thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; -thread_local int DeviceInfo::mlu_core_number_{1}; -thread_local bool DeviceInfo::use_first_conv_{false}; -thread_local std::vector DeviceInfo::mean_vec_; -thread_local std::vector DeviceInfo::std_vec_; -thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; -#endif - #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() { return 0; } -#ifdef LITE_WITH_MLU -void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout) { - switch (core_version) { - case (lite_api::MLUCoreVersion::MLU_220): - mlu_core_version_ = CNML_MLU220; - break; - case (lite_api::MLUCoreVersion::MLU_270): - mlu_core_version_ = CNML_MLU270; - break; - default: - mlu_core_version_ = CNML_MLU270; - break; - } - mlu_core_number_ = core_number; - use_first_conv_ = use_first_conv; - mean_vec_ = mean_vec; - std_vec_ = std_vec; - input_layout_ = input_layout; -} - -cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } - -int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } - -bool DeviceInfo::UseFirstConv() { return use_first_conv_; } - -const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } - -const std::vector& DeviceInfo::StdVec() const { return std_vec_; } - -DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } - -#endif // LITE_WITH_MLU - void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index f5b75039ea14f67cee9d009261b2dd1fc6b46825..f3f10c2d5740d6e8cc7e219b8f0d9d9ff17a8496 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -55,20 +55,6 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); -#ifdef LITE_WITH_MLU - void SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout); - cnmlCoreVersion_t MLUCoreVersion(); - int MLUCoreNumber(); - bool UseFirstConv(); - const std::vector& MeanVec() const; - const std::vector& StdVec() const; - DataLayoutType InputLayout() const; -#endif void SetCache(int l1size, int 
l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -120,15 +106,6 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; -#ifdef LITE_WITH_MLU - static thread_local cnmlCoreVersion_t mlu_core_version_; - static thread_local int mlu_core_number_; - static thread_local bool use_first_conv_; - static thread_local std::vector mean_vec_; - static thread_local std::vector std_vec_; - static thread_local DataLayoutType input_layout_; -#endif - void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::HtoD); break; #endif +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kBM): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + TargetWrapperXPU::MemcpySync(dst, src, size, dir); + break; #endif default: LOG(FATAL) diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index b8234b18922f454c41e295209da13de024184adc..cd129b332fa79dc45d74dc8a0befc1e67a68c316 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -18,17 +18,22 @@ lite_cc_library(mir_passes fusion/conv_activation_fuse_pass.cc fusion/var_conv_2d_activation_fuse_pass.cc fusion/conv_bn_fuse_pass.cc + fusion/conv_conv_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__resnet_cbam_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc fusion/__xpu__fc_fuse_pass.cc + fusion/__xpu__mmdnn_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc + elimination/remove_tf_redundant_ops_pass.cc + elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc type_target_cast_pass.cc diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..7866cb956c4e51d3b69687751325ca3ff4eda9d6 --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+// Remove all of the unused nodes from the control flow op and update the
+// inputs and outputs of the op info. The unused nodes are defined as the nodes
+// which are only linked to the control flow op nodes but never linked to the
+// other op nodes.
+//
+// For example:
+// graph[0]: main block
+//                   in_x
+//          in_f      |     in_z(unused node)
+//              \     |    /
+//               \    |   /
+//   in_w ------- while ------- in_y(unused node)
+//               /    |
+//              /     |
+// (unused node)out_y |
+//                   out_x
+//
+// graph[1]: sub block
+//                   in_x
+//                    |
+//                    |
+//                  conv2d----in_f
+//                    |
+//                    |
+//                   fc ------in_w
+//                    |
+//                    |
+//                 softmax
+//                    |
+//                    |
+//                   out_x
+//
+// After the pass is applied:
+//                   in_x
+//          in_f      |
+//              \     |
+//               \    |
+//   in_w ------- while
+//                    |
+//                    |
+//                    |
+//                   out_x
+
+// Remove the var node from var2rm if it is recursively referred to by any op
+// in the subblock
+void CollectUnusedInputOutputNodes(
+    int block_idx,
+    std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
+    const std::unordered_set<std::string>& control_flow_op_types,
+    std::unordered_map<std::string, Node*>* in_vars2rm,
+    std::unordered_map<std::string, Node*>* out_vars2rm) {
+  auto block_size = graphs->size();
+  for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
+    if (!op_node->IsStmt()) continue;
+    auto op_info = op_node->AsStmt().op_info();
+    auto op_type = op_info->Type();
+    if (control_flow_op_types.count(op_type)) {
+      int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
+      CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
+      CollectUnusedInputOutputNodes(sub_block_idx,
+                                    graphs,
+                                    control_flow_op_types,
+                                    in_vars2rm,
+                                    out_vars2rm);
+    } else {
+      for (auto& var_node : op_node->inlinks) {
+        auto& var_name = var_node->AsArg().name;
+        if (in_vars2rm->count(var_name)) {
+          in_vars2rm->erase(var_name);
+        }
+      }
+      for (auto& var_node : op_node->outlinks) {
+        auto& var_name = var_node->AsArg().name;
+        // Tensor array may be only used as the output vars in the subblock
+        if (in_vars2rm->count(var_name)) {
+          in_vars2rm->erase(var_name);
+        }
+        if (out_vars2rm->count(var_name)) {
+          out_vars2rm->erase(var_name);
+        }
+      }
+    }
+  }
+}
+
+// Remove the unused var nodes from the graph and update the op_info of the
+// control flow op
+void RemoveNodesFromGraphAndUpdateOpInfo(
+    SSAGraph* graph,
+    Node* op_node,
+    const std::unordered_map<std::string, Node*>& in_vars2rm,
+    const std::unordered_map<std::string, Node*>& out_vars2rm) {
+  auto op_info = op_node->AsStmt().mutable_op_info();
+  auto op_type = op_info->Type();
+  // Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
+  // remove them if never used.
+  for (auto& var_node : in_vars2rm) {
+    VLOG(3) << "in var node '" << var_node.first << "' is unlinked from "
+            << op_type;
+    RemoveDirectedLink(var_node.second, op_node);
+  }
+  for (auto& var_node : out_vars2rm) {
+    VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
+            << op_type;
+    RemoveDirectedLink(op_node, var_node.second);
+    // Unlink from all of the out op nodes.
+    std::unordered_set<Node*> out_op_nodes;
+    for (auto* out_op_node : var_node.second->outlinks) {
+      if (!out_op_nodes.count(out_op_node)) {
+        out_op_nodes.insert(out_op_node);
+      }
+    }
+    for (auto* out_op_node : out_op_nodes) {
+      RemoveDirectedLink(var_node.second, out_op_node);
+    }
+  }
+  // Remove the unused nodes from the graph if their inlinks and outlinks are
+  // empty
+  std::unordered_set<Node*> removed_var_nodes;
+  for (auto& var_node : in_vars2rm) {
+    if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
+        !removed_var_nodes.count(var_node.second)) {
+      removed_var_nodes.insert(var_node.second);
+      graph->RemoveNode(var_node.second);
+      VLOG(3) << "in var node " << var_node.first << " is removed";
+    }
+  }
+  for (auto& var_node : out_vars2rm) {
+    if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
+        !removed_var_nodes.count(var_node.second)) {
+      removed_var_nodes.insert(var_node.second);
+      graph->RemoveNode(var_node.second);
+      VLOG(3) << "out var node " << var_node.first << " is removed";
+    }
+  }
+  // Update the op info of the control flow op
+  for (auto& input : *op_info->mutable_inputs()) {
+    for (auto var = input.second.begin(); var != input.second.end();) {
+      if (in_vars2rm.count(*var)) {
+        var = input.second.erase(var);
+      } else {
+        ++var;
+      }
+    }
+  }
+  for (auto& output : *op_info->mutable_outputs()) {
+    for (auto var = output.second.begin(); var != output.second.end();) {
+      if (out_vars2rm.count(*var)) {
+        var = output.second.erase(var);
+      } else {
+        ++var;
+      }
+    }
+  }
+}
+
+void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
+    std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
+  CHECK(graphs && !graphs->empty());
+  graphs_ = graphs;
+}
+
+void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
+    const std::unique_ptr<SSAGraph>& graph) {
+  // Remove the unused input and output nodes from the control flow op nodes,
+  // which are only linked to the control flow op nodes but never linked to
+  // the other op nodes
+  const std::unordered_set<std::string> control_flow_op_types = {
+      "while", "conditional_block"};
+  auto block_size = graphs_->size();
+  for (auto& op_node : graph->StmtTopologicalOrder()) {
+    if (!op_node->IsStmt()) continue;
+    auto op_info = op_node->AsStmt().mutable_op_info();
+    auto op_type = op_info->Type();
+    if (!control_flow_op_types.count(op_type)) continue;
+    int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
+    CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
+    // Initialize the unused nodes with all of the input and output nodes
+    std::unordered_map<std::string, Node*> in_vars2rm, out_vars2rm;
+    for (auto* var_node : op_node->inlinks) {
+      auto& var_name = var_node->AsArg().name;
+      if (!in_vars2rm.count(var_name)) {
+        in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
+      }
+    }
+    for (auto* var_node : op_node->outlinks) {
+      auto& var_name = var_node->AsArg().name;
+      if (!out_vars2rm.count(var_name)) {
+        out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
+      }
+    }
+    // Remove the nodes which are used in the subblock recursively; the
+    // remaining nodes are the unused ones.
+ CollectUnusedInputOutputNodes(sub_block_idx, + graphs_, + control_flow_op_types, + &in_vars2rm, + &out_vars2rm); + if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) { + // Remove the unused nodes from graph, and update the op info of the + // control flow op + RemoveNodesFromGraphAndUpdateOpInfo( + graph.get(), op_node, in_vars2rm, out_vars2rm); + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS( + control_flow_op_unused_inputs_and_outputs_eliminate_pass, + paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass) + .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2863661de1e93d15bfe835e39033d4ecaee6d8cc --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr &graph) override; + void SetAllGraphs(std::vector> *graphs); + + private: + std::vector> *graphs_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..673854b118a8adaca73cb905eda4892b6903665c --- /dev/null +++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc @@ -0,0 +1,245 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
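// Editor's note: an illustrative sketch, not part of this patch, of how the
// control_flow_op_unused_inputs_and_outputs_eliminate_pass registered above
// might be driven. The pass needs the SSA graphs of every block via
// SetAllGraphs() before Apply() runs on the main graph; the lookup pattern
// follows the PassManager usage elsewhere in the optimizer, but this driver is
// a simplified assumption rather than the actual optimizer code:
//
//   std::vector<std::unique_ptr<mir::SSAGraph>> graphs;  // one graph per block
//   auto* pass =
//       mir::PassManager::Global()
//           .LookUp<mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass>(
//               "control_flow_op_unused_inputs_and_outputs_eliminate_pass");
//   pass->SetAllGraphs(&graphs);
//   pass->Apply(graphs[0]);  // graphs[0] holds the main block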
+ +#include "lite/core/mir/elimination/remove_tf_redundant_ops_pass.h" +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" +#include "lite/model_parser/cpp_desc.h" + +namespace paddle { +namespace lite { +namespace mir { + +void RemoveTFRedundantOpsPass::Apply(const std::unique_ptr& graph) { + RemoveSqueeze2Reshape2Pattern(graph); + RemoveReshape2Pattern(graph); +} + +void RemoveTFRedundantOpsPass::RemoveReshape2Pattern( + const std::unique_ptr& graph) { + bool found = false; + Node* softmax_node{nullptr}; + Node* reshape2_node{nullptr}; + std::string reshape2_out_arg_name; + Node* fetch_node{nullptr}; + std::string fetch_in_arg_name; + DDim softmax_out_dims; + DDim reshape2_out_dims; + + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->AsStmt().picked_kernel().op_type() == "softmax") { + softmax_node = op_node; + } else if (op_node->AsStmt().picked_kernel().op_type() == "reshape2") { + reshape2_node = op_node; + } else if (op_node->AsStmt().picked_kernel().op_type() == "fetch") { + fetch_node = op_node; + fetch_in_arg_name = fetch_node->inlinks.front()->AsArg().name; + } + } + + if (softmax_node == nullptr || reshape2_node == nullptr) { + return; + } + + // Get out tensor dims of softmax, reshape2 + auto* scope = softmax_node->AsStmt().op()->scope(); + auto softmax_out_arg_name = softmax_node->outlinks.front()->AsArg().name; + auto softmax_out_tensor = + scope->FindVar(softmax_out_arg_name)->Get(); + softmax_out_dims = softmax_out_tensor.dims(); + + for (auto out_node : reshape2_node->outlinks) { + if (out_node->IsArg() && out_node->outlinks.size() != 0) { + reshape2_out_arg_name = reshape2_node->outlinks.front()->AsArg().name; + auto reshape2_out_tensor = + scope->FindVar(reshape2_out_arg_name)->Get(); + reshape2_out_dims = reshape2_out_tensor.dims(); + } + } + + VLOG(3) << "reshape2_out_dims:" << reshape2_out_dims; + VLOG(3) << "softmax_out_dims:" << softmax_out_dims; + VLOG(3) << "found:" << found; + + if (softmax_out_dims == reshape2_out_dims && + softmax_node->outlinks.front() == reshape2_node->inlinks.front() && + reshape2_out_arg_name == fetch_in_arg_name) { + found = true; + } + + if (found) { + // link out_arg to op + IR_NODE_LINK_TO(softmax_node->outlinks.front(), fetch_node); + + // collect nodes to safe remove + std::set nodes_to_remove; + auto remove_inst_node_and_out_args_node = [&](Node* n) { + nodes_to_remove.insert(n); + for (auto& out : n->outlinks) { + nodes_to_remove.insert(out); + } + }; + + remove_inst_node_and_out_args_node(reshape2_node); + GraphSafeRemoveNodes(graph.get(), nodes_to_remove); + auto fetch_op_desc = fetch_node->AsStmt().mutable_op_info(); + fetch_op_desc->SetInput("X", + {softmax_node->outlinks.front()->AsArg().name}); + } + VLOG(5) << "\n" << Visualize(graph.get()); +} + +void RemoveTFRedundantOpsPass::RemoveSqueeze2Reshape2Pattern( + const std::unique_ptr& graph) { + VLOG(5) << Visualize(graph.get()); + bool found = false; + + // find out_arg->squeeze2 + // find out_arg_dims of out_arg + Node* out_arg_node{nullptr}; + DDim out_arg_dims; + Node* squeeze2_node{nullptr}; + + // find squeeze2->reshape2 + // find output dims of squeeze2 and reshape2 nodes + DDim squeeze2_out_dims; + Node* reshape2_node{nullptr}; + Node* reshape2_out_node{nullptr}; + DDim reshape2_out_dims; + + // find next inst node of reshape2 + Node* next_inst_node_of_reshape2_out{nullptr}; + + for (auto& node : 
graph->StmtTopologicalOrder()) { + if (node->AsStmt().picked_kernel().op_type() != "squeeze2") continue; + auto* scope = node->AsStmt().op()->scope(); + + // find inlinks of squeeze2: out_arg_node + squeeze2_node = node; + auto squeeze2_inlinks = squeeze2_node->inlinks; + VLOG(5) << "squeeze2_inlinks.size():" << squeeze2_inlinks.size(); + for (auto& in_link : squeeze2_inlinks) { + if (in_link->IsArg() && squeeze2_inlinks.size() == 1) { + out_arg_node = in_link; + auto* var = scope->FindVar(out_arg_node->AsArg().name); + out_arg_dims = var->Get().dims(); + VLOG(5) << "arg name:" << out_arg_node->AsArg().name + << " dims:" << out_arg_dims; + } else { + // found mutli-input links + continue; + } + } + + // find squeeze2->reshape2 pattern + // and output dims of squeeze2, reshape2 nodes + auto squeeze2_outlinks = squeeze2_node->outlinks; + for (auto& squeeze2_out_link : squeeze2_outlinks) { + if (squeeze2_out_link->IsArg() && + squeeze2_out_link->outlinks.size() != 0) { + auto* squeeze2_out_var = + scope->FindVar(squeeze2_out_link->AsArg().name); + squeeze2_out_dims = squeeze2_out_var->Get().dims(); + + VLOG(5) << "squeeze2_out_arg.name:" << squeeze2_out_link->AsArg().name + << " squeeze2_out_dims:" << squeeze2_out_dims + << " squeeze2_out_link->outlinks.size():" + << squeeze2_out_link->outlinks.size(); + + for (auto& out2_link : squeeze2_out_link->outlinks) { + if (out2_link->IsStmt() && + out2_link->AsStmt().picked_kernel().op_type() == "reshape2") { + reshape2_node = out2_link; + for (auto& reshape2_out_link : reshape2_node->outlinks) { + if (reshape2_out_link->IsArg() && + reshape2_out_link->outlinks.size() != 0) { + reshape2_out_node = reshape2_out_link; + auto* reshape2_out_var = + scope->FindVar(reshape2_out_link->AsArg().name); + reshape2_out_dims = + reshape2_out_var->Get().dims(); + + VLOG(5) << "reshape2_out_node:" << reshape2_out_node + << " reshape2_out_name:" + << reshape2_out_link->AsArg().name + << " reshape2_out_dims:" << reshape2_out_dims; + } + } + } + } + } + } + + // find next inst node of reshape2 + VLOG(5) << "reshape2_out_node->outlinks.size():" + << reshape2_out_node->outlinks.size() + << " reshape2_out_node->IsStmt():" << reshape2_out_node->IsStmt(); + VLOG(5) << "reshape2_out_node->AsArg().name:" + << reshape2_out_node->AsArg().name; + if (reshape2_out_node != nullptr && + reshape2_out_node->outlinks.size() == 1 && + reshape2_out_node->outlinks.front()->IsStmt()) { + next_inst_node_of_reshape2_out = reshape2_out_node->outlinks.front(); + found = true; + break; + VLOG(5) + << "next_inst_node_of_reshape2_out->picked_kernel().op_type():" + << next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type(); + } + + VLOG(5) << "=============================="; + VLOG(5) << "out_arg_dims:" << out_arg_dims; + VLOG(5) << "squeeze2_out_dims:" << squeeze2_out_dims; + VLOG(5) << "reshape2_out_dims:" << reshape2_out_dims; + VLOG(5) << "=============================="; + } + + // replace pattern + if (found && out_arg_dims[1] == squeeze2_out_dims[1] && + out_arg_dims[1] == reshape2_out_dims[1] && out_arg_dims[1] == 1001 && + out_arg_dims[2] == out_arg_dims[3] && out_arg_dims[2] == 1 && + next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type() == + "softmax") { + // link out_arg to op + IR_NODE_LINK_TO(out_arg_node, next_inst_node_of_reshape2_out); + + // collect nodes to safe remove + std::set nodes_to_remove; + auto remove_inst_node_and_out_args_node = [&](Node* n) { + nodes_to_remove.insert(n); + for (auto& out : n->outlinks) { + nodes_to_remove.insert(out); + } + 
}; + remove_inst_node_and_out_args_node(squeeze2_node); + remove_inst_node_and_out_args_node(reshape2_node); + GraphSafeRemoveNodes(graph.get(), nodes_to_remove); + auto next_inst_op_desc = + next_inst_node_of_reshape2_out->AsStmt().mutable_op_info(); + next_inst_op_desc->SetInput("X", {out_arg_node->AsArg().name}); + VLOG(5) << Visualize(graph.get()); + } + VLOG(5) << "replace pattern fininshed"; +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(remove_tf_redundant_ops_pass, + paddle::lite::mir::RemoveTFRedundantOpsPass) + .BindTargets({TARGET(kOpenCL), TARGET(kARM)}); diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..652a8fb4a7f67e173527725e3bbecfadcde96798 --- /dev/null +++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/tensor.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * mir::RemoveTFRedundantOpsPass remove reshape2->squeeze2 pattern + * and last reshape2 op for tensorflow mobilenetv1/v2. + */ +class RemoveTFRedundantOpsPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + void RemoveReshape2Pattern(const std::unique_ptr& graph); + void RemoveSqueeze2Reshape2Pattern(const std::unique_ptr& graph); +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571..95723bbd21dc02ed8bb5b46c48f9836d3f9aff1f 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation lite_cc_library(fuse_conv_bn SRCS conv_bn_fuser.cc DEPS pattern_matcher_high_api) +lite_cc_library(fuse_conv_conv + SRCS conv_conv_fuser.cc + DEPS pattern_matcher_high_api) lite_cc_library(fuse_elementwise_add_activation SRCS elementwise_add_activation_fuser.cc DEPS pattern_matcher_high_api) @@ -42,6 +45,7 @@ set(mir_fusers fuse_conv_activation fuse_var_conv_activation fuse_conv_bn + fuse_conv_conv fuse_quant_dequant fuse_elementwise_add_activation fuse_transpose_softmax_transpose diff --git a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..db950fd4b4d671ed618c8bc53010e5be6f5fd78b --- /dev/null +++ b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc @@ -0,0 +1,1644 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUMmdnnFloat2Fix { + public: + void operator()(SSAGraph* graph) { + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + auto* op_info = node->stmt()->op_info(); + std::string op_type = op_info->Type(); + + static const std::vector target_ops{"var_conv_2d", + "search_fc"}; + if (std::find(target_ops.begin(), target_ops.end(), op_type) != + target_ops.end()) { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + memcpy( + weight_on_host, weight_int16.get(), weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "match_matrix_tensor") { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1] * weight_dims[2]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix && Transposed, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "search_grnn") { + auto* scope = node->stmt()->op()->scope(); + + std::string wi_name = op_info->Input("Wi").front(); + auto* wi_t = scope->FindMutableTensor(wi_name); + auto wi_dims = wi_t->dims(); + auto wi_len = wi_t->numel(); + auto wi_stride_len = 
wi_len / 3; + float* wi_on_host = wi_t->mutable_data(); + std::unique_ptr wi_int16(new int16_t[wi_len]); + std::vector wi_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wi_on_host + i * wi_stride_len, wi_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wi_on_host + i * wi_stride_len, + wi_int16.get() + i * wi_stride_len, + max_f, + wi_stride_len); + wi_max[i] = max_f; + } + memcpy(wi_on_host, wi_int16.get(), wi_len * sizeof(int16_t)); + + std::string wh_name = op_info->Input("Wh").front(); + auto* wh_t = scope->FindMutableTensor(wh_name); + auto wh_dims = wh_t->dims(); + auto wh_len = wh_t->numel(); + auto wh_stride_len = wh_len / 3; + float* wh_on_host = wh_t->mutable_data(); + std::unique_ptr wh_int16(new int16_t[wh_len]); + std::vector wh_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wh_on_host + i * wh_stride_len, wh_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wh_on_host + i * wh_stride_len, + wh_int16.get() + i * wh_stride_len, + max_f, + wh_stride_len); + wh_max[i] = max_f; + } + memcpy(wh_on_host, wh_int16.get(), wh_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr>("__xpu__wi_max", wi_max); + update_op_info.SetAttr>("__xpu__wh_max", wh_max); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type << ", wi_name=" << wi_name + << ", wh_name=" << wh_name; + } + } + } +}; + +class XPUMmdnnSearchAttentionFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input = VarNode("input")->AsInput(); + + auto* search_group_padding = + OpNode("search_group_padding", "search_group_padding"); + auto* out_emb_padding = + VarNode("out_emb_padding") + ->assert_is_op_output("search_group_padding", "Out_emb_padding") + ->AsIntermediate(); + auto* out_new = VarNode("out_new") + ->assert_is_op_output("search_group_padding", "Out_new") + ->AsIntermediate(); + auto* out_padding = + VarNode("out_padding") + ->assert_is_op_output("search_group_padding", "Out_padding") + ->AsIntermediate(); + + auto* search_seq_fc_w = VarNode("search_seq_fc_w") + ->assert_is_op_input("search_seq_fc", "W") + ->AsInput(); + auto* search_seq_fc_b = VarNode("search_seq_fc_b") + ->assert_is_op_input("search_seq_fc", "b") + ->AsInput(); + auto* search_seq_fc = + OpNode("search_seq_fc", "search_seq_fc")->AsIntermediate(); + auto* search_seq_fc_out = VarNode("search_seq_fc_out") + ->assert_is_op_output("search_seq_fc", "Out") + ->AsIntermediate(); + + auto* search_aligned_mat_mul = + OpNode("search_aligned_mat_mul", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_out = + VarNode("search_aligned_mat_mul_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_a = + VarNode("search_aligned_mat_mul_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_b = + VarNode("search_aligned_mat_mul_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_c = + VarNode("search_aligned_mat_mul_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_attention_padding_mask = + OpNode("search_attention_padding_mask", "search_attention_padding_mask") + ->AsIntermediate(); + auto* 
search_attention_padding_mask_out = + VarNode("search_attention_padding_mask_out") + ->assert_is_op_output("search_attention_padding_mask", "Out") + ->AsIntermediate(); + auto* search_attention_padding_mask_pad_begin = + VarNode("search_attention_padding_mask_pad_begin") + ->assert_is_op_output("search_attention_padding_mask", "pad_begin") + ->AsIntermediate(); + + auto* search_seq_softmax = + OpNode("search_seq_softmax", "search_seq_softmax")->AsIntermediate(); + auto* search_seq_softmax_out = + VarNode("search_seq_softmax_out") + ->assert_is_op_output("search_seq_softmax", "Out") + ->AsIntermediate(); + auto* search_seq_softmax_out_log = + VarNode("search_seq_softmax_out_log") + ->assert_is_op_output("search_seq_softmax", "Out_log") + ->AsIntermediate(); + + auto* search_aligned_mat_mul_2 = + OpNode("search_aligned_mat_mul_2", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_out = + VarNode("search_aligned_mat_mul_2_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_a = + VarNode("search_aligned_mat_mul_2_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_b = + VarNode("search_aligned_mat_mul_2_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_c = + VarNode("search_aligned_mat_mul_2_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_seq_depadding = + OpNode("search_seq_depadding")->AsIntermediate(); + auto* search_seq_depadding_out = + VarNode("search_seq_depadding_out")->AsOutput(); + + *input >> *search_group_padding >> *out_emb_padding; + *search_group_padding >> *out_new; + *search_group_padding >> *out_padding; + + *search_seq_fc_w >> *search_seq_fc; + *search_seq_fc_b >> *search_seq_fc; + *out_emb_padding >> *search_seq_fc; + *search_seq_fc >> *search_seq_fc_out; + + *search_seq_fc_out >> *search_aligned_mat_mul; + *out_emb_padding >> *search_aligned_mat_mul; + *search_aligned_mat_mul >> *search_aligned_mat_mul_out; + *search_aligned_mat_mul >> *search_aligned_mat_mul_a; + *search_aligned_mat_mul >> *search_aligned_mat_mul_b; + *search_aligned_mat_mul >> *search_aligned_mat_mul_c; + + *search_aligned_mat_mul_out >> *search_attention_padding_mask; + *out_padding >> *search_attention_padding_mask; + *search_attention_padding_mask >> *search_attention_padding_mask_out; + *search_attention_padding_mask >> *search_attention_padding_mask_pad_begin; + + *search_attention_padding_mask_out >> *search_seq_softmax; + *search_seq_softmax >> *search_seq_softmax_out; + *search_seq_softmax >> *search_seq_softmax_out_log; + + *search_seq_softmax_out >> *search_aligned_mat_mul_2; + *out_emb_padding >> *search_aligned_mat_mul_2; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_out; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_a; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_b; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_c; + + *search_aligned_mat_mul_2_out >> *search_seq_depadding; + *out_new >> *search_seq_depadding; + *search_seq_depadding >> *search_seq_depadding_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_search_attention"); + op_desc.SetInput("X", {matched.at("input")->arg()->name}); + op_desc.SetInput("W", {matched.at("search_seq_fc_w")->arg()->name}); + 
op_desc.SetInput("b", {matched.at("search_seq_fc_b")->arg()->name}); + op_desc.SetOutput("Out", + {matched.at("search_seq_depadding_out")->arg()->name}); + + auto* padding_op_info = + matched.at("search_group_padding")->stmt()->op_info(); + op_desc.SetAttr("pad_id", padding_op_info->GetAttr("pad_id")); + auto* matmul_0_op_info = + matched.at("search_aligned_mat_mul")->stmt()->op_info(); + op_desc.SetAttr("alpha0", matmul_0_op_info->GetAttr("alpha")); + auto* matmul_1_op_info = + matched.at("search_aligned_mat_mul_2")->stmt()->op_info(); + op_desc.SetAttr("alpha1", matmul_1_op_info->GetAttr("alpha")); + auto* mask_op_info = + matched.at("search_attention_padding_mask")->stmt()->op_info(); + op_desc.SetAttr("mask", mask_op_info->GetAttr("mask")); + + auto* new_stmt = matched.at("search_group_padding")->stmt(); + auto* scope = new_stmt->op()->scope(); + auto w_name = matched.at("search_seq_fc_w")->arg()->name; + auto* w_t = scope->FindMutableTensor(w_name); + auto w_dims = w_t->dims(); + int w_len = w_t->numel(); + float* w_on_host = w_t->mutable_data(); + + float max_f = paddle::lite::xpu::math::FindMaxAbs(w_on_host, w_len); + std::unique_ptr w_int16(new int16_t[w_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + w_on_host, w_int16.get(), max_f, w_len); + memcpy(w_on_host, w_int16.get(), w_len * sizeof(int16_t)); + op_desc.SetAttr("W_max", max_f); + + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, scope); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + DirectedLink(matched.at("search_seq_fc_w"), + matched.at("search_group_padding")); + DirectedLink(matched.at("search_seq_fc_b"), + matched.at("search_group_padding")); + IR_OP_VAR_LINK(matched.at("search_group_padding"), + matched.at("search_seq_depadding_out")); + } +}; + +// 4 inputs +// ======== +// +// input_x +// input_y +// topk_row +// topk_col +// +// input_x ------- match_matrix_tensor ------- input_y +// | +// relu +// ________/ \________ +// | | +// var_conv_2d | +// | | +// relu | +// |_______ _______| +// \ / +// sequence_concat +// | +// topk_row ---- sequence_topk_avg_pooling ----- topk_col +// +class XPUMmdnnMatchConvTopkFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->AsInput(); + auto* input_y = VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* 
relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk_col = + VarNode("topk_col") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* topk_row = + VarNode("topk_row") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + ->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *topk_col >> *topk; + *topk_row >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is marked intermediate. 
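+    // Only links to nodes that still exist after fusion have to be rebuilt by
+    // hand, which is what the DirectedLink()/IR_OP_VAR_LINK() calls below do.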
+ // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +// 2 inputs +// ======== +// +// input_x +// input_y +// +// input_x ------- match_matrix_tensor ------- input_y +// | | | +// | relu | +// | ________/ \________ | +// | | | | +// | var_conv_2d | | +// | | | | +// | relu | | +// | |_______ _______| | +// | \ / | +// | sequence_concat | +// | | | +// |--------- sequence_topk_avg_pooling -------| +// +class XPUMmdnnMatchConvTopkFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* input_y = + VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + ->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *input_x >> *topk; + *input_y >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + 
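+    // This builds the same fused op as XPUMmdnnMatchConvTopkFuser; only the
+    // matched pattern differs, with ROW/COLUMN of sequence_topk_avg_pooling
+    // reusing the match_matrix_tensor inputs instead of two extra input vars.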
op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is marked intermediate. + // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidSeqRevEmbEltwiseFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + // fwd emb + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = + VarNode("emb0_out")->assert_is_op_output("lookup_table", "Out"); + auto* emb1 = OpNode("emb1", "lookup_table"); + auto* emb1_out = + VarNode("emb1_out")->assert_is_op_output("lookup_table", "Out"); + + auto* eltwise01 = OpNode("eltwise01", "search_seq_arithmetic"); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + // rev emb + auto* seq_rev2 = OpNode("seq_rev2", "sequence_reverse")->AsIntermediate(); + auto* seq_rev2_out = VarNode("seq_rev2_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_rev3 = OpNode("seq_rev3", "sequence_reverse")->AsIntermediate(); + auto* seq_rev3_out = VarNode("seq_rev3_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* emb2 = OpNode("emb2", "lookup_table")->AsIntermediate(); + auto* emb2_out = VarNode("emb2_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb3 = OpNode("emb3", "lookup_table")->AsIntermediate(); + auto* emb3_out = VarNode("emb3_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + + auto* eltwise23 = + OpNode("eltwise23", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise23_out = + 
VarNode("eltwise23_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *input0 >> *seq_rev2 >> *seq_rev2_out >> *emb2 >> *emb2_out >> *eltwise23 >> + *eltwise23_out; + *emb_tbl >> *emb2; + *input1 >> *seq_rev3 >> *seq_rev3_out >> *emb3 >> *emb3_out >> *eltwise23; + *emb_tbl >> *emb3; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("sequence_reverse"); + op_desc.SetInput("X", {matched.at("eltwise01_out")->arg()->name}); + op_desc.SetOutput("Y", {matched.at("eltwise23_out")->arg()->name}); + + auto emb0_op = matched.at("emb0")->stmt()->op(); + auto new_seq_rev_op = LiteOpRegistry::Global().Create("sequence_reverse"); + new_seq_rev_op->Attach(op_desc, emb0_op->scope()); + auto* new_seq_rev_node = + graph->GraphCreateInstructNode(new_seq_rev_op, emb0_op->valid_places()); + + DirectedLink(matched.at("eltwise01_out"), new_seq_rev_node); + DirectedLink(new_seq_rev_node, matched.at("eltwise23_out")); + } +}; + +class XPUMmdnnBidEmbAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *att_2in1 >> *att_2in1_out >> *seq_pool_2in1 >> + *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + 
op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", "att_2in1_w", "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_2in1_out", "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +// 5 outputs +// ========= +// +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// +class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + 
VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> 
*att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +// 6 outputs +// ========= +// +// emb0_out +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// +class XPUMmdnnBidEmbGrnnAttFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + 
->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "X") + ->AsOutput(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "Y") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = 
VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att2"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("emb0_out", {matched.at("emb0_out")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", 
+ {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnMergeAllFuser : public FuseBase { + public: + explicit XPUMmdnnMergeAllFuser(int n_concat_topk) + : n_concat_topk_(n_concat_topk) {} + + void BuildPattern() override { + auto* concat_7in1_input0 = VarNode("concat_7in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_7in1_input1 = VarNode("concat_7in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_7in1_input2 = VarNode("concat_7in1_input2") + ->assert_is_op_nth_input("concat", "X", 2) + ->AsInput(); + auto* concat_7in1_input3 = VarNode("concat_7in1_input3") + ->assert_is_op_nth_input("concat", "X", 3) + ->AsInput(); + auto* concat_7in1_input4 = VarNode("concat_7in1_input4") + ->assert_is_op_nth_input("concat", "X", 4) + ->AsInput(); + auto* concat_7in1_input5 = VarNode("concat_7in1_input5") + ->assert_is_op_nth_input("concat", "X", 5) + ->AsInput(); + auto* concat_7in1_input6 = VarNode("concat_7in1_input6") + ->assert_is_op_nth_input("concat", "X", 6) + ->AsInput(); + auto* concat_7in1 = OpNode("concat_7in1", "concat"); + auto* concat_7in1_out = VarNode("concat_7in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc0_w = VarNode("search_fc0_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc0_b = VarNode("search_fc0_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc0 = OpNode("search_fc0", "search_fc")->AsIntermediate(); + auto* search_fc0_out = VarNode("search_fc0_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + + auto* concat_topk_input0 = VarNode("concat_topk_input0") + ->assert_is_op_nth_input("concat", "X", 0) + 
->AsInput(); + auto* concat_topk_input1 = VarNode("concat_topk_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_topk = OpNode("concat_topk", "concat")->AsIntermediate(); + auto* concat_topk_out = VarNode("concat_topk_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + auto* concat_topk_inputx = VarNode(concat_topk_input_name) + ->assert_is_op_nth_input("concat", "X", i) + ->AsInput(); + *concat_topk_inputx >> *concat_topk; + } + + auto* seq_rev = OpNode("seq_rev", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_out = VarNode("seq_rev_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + + auto* grnn_rv_wh = VarNode("grnn_rv_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_rv_wi = VarNode("grnn_rv_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_rv = OpNode("grnn_rv", "search_grnn")->AsIntermediate(); + auto* grnn_rv_out = VarNode("grnn_rv_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_rv_idx_sorted_by_width = + VarNode("grnn_rv_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_rv_layout_input = + VarNode("grnn_rv_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_rv_tmp_buffer = + VarNode("grnn_rv_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_rv = + OpNode("seq_pool_rv", "sequence_pool")->AsIntermediate(); + auto* seq_pool_rv_out = VarNode("seq_pool_rv_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_rv_max_idx = + VarNode("seq_pool_rv_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_fw_wh = VarNode("grnn_fw_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_fw_wi = VarNode("grnn_fw_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_fw = OpNode("grnn_fw", "search_grnn")->AsIntermediate(); + auto* grnn_fw_out = VarNode("grnn_fw_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_fw_idx_sorted_by_width = + VarNode("grnn_fw_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_fw_layout_input = + VarNode("grnn_fw_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_fw_tmp_buffer = + VarNode("grnn_fw_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_fw = + OpNode("seq_pool_fw", "sequence_pool")->AsIntermediate(); + auto* seq_pool_fw_out = VarNode("seq_pool_fw_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_fw_max_idx = + VarNode("seq_pool_fw_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* rv_fw_concat = OpNode("rv_fw_concat", "concat")->AsIntermediate(); + auto* rv_fw_concat_out = VarNode("rv_fw_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + + auto* last_concat = OpNode("last_concat", "concat")->AsIntermediate(); + auto* last_concat_out = VarNode("last_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* 
search_fc1_w = VarNode("search_fc1_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc1_b = VarNode("search_fc1_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc1 = OpNode("search_fc1", "search_fc")->AsIntermediate(); + auto* search_fc1_out = VarNode("search_fc1_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* search_fc2_w = VarNode("search_fc2_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc2_b = VarNode("search_fc2_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc2 = OpNode("search_fc2", "search_fc")->AsIntermediate(); + auto* search_fc2_out = VarNode("search_fc2_out") + ->assert_is_op_output("search_fc", "Out") + ->AsOutput(); + + *concat_7in1_input0 >> *concat_7in1; + *concat_7in1_input1 >> *concat_7in1; + *concat_7in1_input2 >> *concat_7in1; + *concat_7in1_input3 >> *concat_7in1; + *concat_7in1_input4 >> *concat_7in1; + *concat_7in1_input5 >> *concat_7in1; + *concat_7in1_input6 >> *concat_7in1; + *concat_7in1 >> *concat_7in1_out >> *search_fc0 >> *search_fc0_out >> + *relu0 >> *relu0_out; + *search_fc0_w >> *search_fc0; + *search_fc0_b >> *search_fc0; + + *concat_topk_input0 >> *concat_topk; + *concat_topk_input1 >> *concat_topk; + *concat_topk >> *concat_topk_out >> *seq_rev >> *seq_rev_out; + + *seq_rev_out >> *grnn_rv >> *grnn_rv_out >> *seq_pool_rv >> + *seq_pool_rv_out; + *seq_pool_rv >> *seq_pool_rv_max_idx; + *grnn_rv_wh >> *grnn_rv; + *grnn_rv_wi >> *grnn_rv; + *grnn_rv >> *grnn_rv_idx_sorted_by_width; + *grnn_rv >> *grnn_rv_layout_input; + *grnn_rv >> *grnn_rv_tmp_buffer; + + *concat_topk_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> + *seq_pool_fw_out; + *seq_pool_fw >> *seq_pool_fw_max_idx; + *grnn_fw_wh >> *grnn_fw; + *grnn_fw_wi >> *grnn_fw; + *grnn_fw >> *grnn_fw_idx_sorted_by_width; + *grnn_fw >> *grnn_fw_layout_input; + *grnn_fw >> *grnn_fw_tmp_buffer; + + *seq_pool_rv_out >> *rv_fw_concat; + *seq_pool_fw_out >> *rv_fw_concat; + *rv_fw_concat >> *rv_fw_concat_out; + + *rv_fw_concat_out >> *last_concat; + *relu0_out >> *last_concat; + *last_concat >> *last_concat_out >> *search_fc1 >> *search_fc1_out >> + *relu1 >> *relu1_out >> *search_fc2 >> *search_fc2_out; + *search_fc1_w >> *search_fc1; + *search_fc1_b >> *search_fc1; + *search_fc2_w >> *search_fc2; + *search_fc2_b >> *search_fc2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_merge_all"); + auto* concat_7in1_op_info = matched.at("concat_7in1")->stmt()->op_info(); + op_desc.SetInput("concat_7in1_x", concat_7in1_op_info->Input("X")); + auto* concat_topk_op_info = matched.at("concat_topk")->stmt()->op_info(); + op_desc.SetInput("concat_topk_x", concat_topk_op_info->Input("X")); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_fw_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_fw_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_rv_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_rv_wi")->arg()->name}); + op_desc.SetInput("fc0_w", {matched.at("search_fc0_w")->arg()->name}); + op_desc.SetInput("fc0_b", {matched.at("search_fc0_b")->arg()->name}); + op_desc.SetInput("fc1_w", {matched.at("search_fc1_w")->arg()->name}); + op_desc.SetInput("fc1_b", 
{matched.at("search_fc1_b")->arg()->name}); + op_desc.SetInput("fc2_w", {matched.at("search_fc2_w")->arg()->name}); + op_desc.SetInput("fc2_b", {matched.at("search_fc2_b")->arg()->name}); + + op_desc.SetOutput("out", {matched.at("search_fc2_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_fw")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_rv")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* fc0_op_info = matched.at("search_fc0")->stmt()->op_info(); + op_desc.SetAttr("fc0_w_max", + fc0_op_info->GetAttr("__xpu__w_max")); + auto* fc1_op_info = matched.at("search_fc1")->stmt()->op_info(); + op_desc.SetAttr("fc1_w_max", + fc1_op_info->GetAttr("__xpu__w_max")); + auto* fc2_op_info = matched.at("search_fc2")->stmt()->op_info(); + op_desc.SetAttr("fc2_w_max", + fc2_op_info->GetAttr("__xpu__w_max")); + + auto* new_stmt = matched.at("concat_7in1")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "concat_topk_input0", + "concat_topk_input1", + "grnn_fw_wh", + "grnn_fw_wi", + "grnn_rv_wh", + "grnn_rv_wi", + "search_fc0_w", + "search_fc0_b", + "search_fc1_w", + "search_fc1_b", + "search_fc2_w", + "search_fc2_b", + }; + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + arg_names.push_back(concat_topk_input_name); + } + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("concat_7in1")); + } + std::vector out_names{ + "search_fc2_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("concat_7in1"), matched.at(name)); + } + } + + private: + int n_concat_topk_; +}; + +} // namespace fusion + +class XPUMmdnnFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUMmdnnFloat2Fix float_2_fix; + float_2_fix(graph.get()); + fusion::XPUMmdnnSearchAttentionFuser search_att_fuser; + search_att_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser match_conv_topk_fuser; + match_conv_topk_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser2 match_conv_topk_fuser2; + match_conv_topk_fuser2(graph.get()); + + fusion::XPUMmdnnBidSeqRevEmbEltwiseFuser bi_seq_rev_emb_eltwise_fuser; + bi_seq_rev_emb_eltwise_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser bid_emb_grnn_att_fuser; + bid_emb_grnn_att_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser2 bid_emb_grnn_att_fuser2; + bid_emb_grnn_att_fuser2(graph.get()); + fusion::XPUMmdnnBidEmbAttFuser bid_emb_att_fuser; + bid_emb_att_fuser(graph.get()); + for (int n_concat_topk : {3, 2}) { + fusion::XPUMmdnnMergeAllFuser merge_all_fuser(n_concat_topk); + merge_all_fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__mmdnn_fuse_pass, paddle::lite::mir::XPUMmdnnFusePass) + .BindTargets({TARGET(kXPU)}) + 
.BindKernel("__xpu__mmdnn_search_attention") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att2") + .BindKernel("__xpu__mmdnn_bid_emb_att") + .BindKernel("__xpu__mmdnn_match_conv_topk") + .BindKernel("__xpu__mmdnn_merge_all"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 525042e44b2997013943f392f592d812bd68fa0b..21bc266204d95c0f7faa8c3796e4b6255a3fe741 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase { op_desc.SetAttr("act_type", act_type_); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); auto* single_encoder_stmt = matched.at("q_mul")->stmt(); fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); @@ -639,20 +639,21 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::set fc_int31_ids; #ifdef LITE_WITH_XPU // TODO(miaotianxiang): core/mir/*_pass.cc are compiled anyway and need to - // access Context::_multi_encoder_precision, but this static member - // variable in class specialization defined in lite/core/context.cc - // is only compiled iff LITE_WITH_XPU==ON. To suppress linkage error, we use + // access TargetWrapperXPU::multi_encoder_precision, but this static member + // variable in class specialization defined in + // lite/backends/xpu/target_wrapper.cc is only compiled iff + // LITE_WITH_XPU==ON. To suppress linkage error, we use // #ifdef here. Any better idea? if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::Context::_multi_encoder_precision == "int31") { + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { fc_int31_ids = {0, 1, 2, 3, 4, 5}; VLOG(3) << "Use int31 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } else { VLOG(3) << "Use int16 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } #endif diff --git a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f017cc8c72f93a772f8bcbdc9aa96d5b0ad215d8 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc @@ -0,0 +1,1389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetCbamBlock0Fuser : public FuseBase { + public: + XPUResNetCbamBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* 
left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* left_conv4_weight = VarNode("left_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv4 = OpNode("left_conv4", "conv2d")->AsIntermediate(); + auto* left_conv4_out = VarNode("left_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", 
"Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> + *left_bn3_out /* >> *add*/; + + *left_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *left_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *left_conv4 >> *left_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *left_conv4_weight >> *left_conv4; + *left_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + 
*left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("left_conv4_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr<int32_t>("sub_block", 0); + op_desc.SetAttr<std::vector<std::string>>("input_data_names", {}); + op_desc.SetAttr<std::vector<std::string>>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared<cpp::ProgramDesc>(); + sub_program_desc->AddBlock(); + static_cast<operators::SubgraphOp*>(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector<std::string> froms = { + "left_conv2_weight", + "left_conv3_weight", + "left_conv4_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class 
XPUResNetCbamBlock1Fuser : public FuseBase { + public: + XPUResNetCbamBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + 
VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* right_conv4_weight = VarNode("right_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv4 = OpNode("right_conv4", "conv2d")->AsIntermediate(); + auto* right_conv4_out = VarNode("right_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", 
"elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out /* >> *add*/; + + *right_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *right_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *right_conv4 >> *right_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *right_conv4_weight >> *right_conv4; + *right_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + matched.at("right_conv4_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + 
"placeholder_sa_conv", + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); + static_cast(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_conv4_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock2Fuser : public FuseBase { + public: + XPUResNetCbamBlock2Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input")->assert_is_op_input("clip", "X")->AsInput(); + + auto* clip = OpNode("clip", "clip"); + auto* clip_out = VarNode("clip_out") + ->assert_is_op_output("clip", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + auto* eltwise_y = VarNode("eltwise_y") + ->assert_is_op_input("elementwise_pow") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_pow = + OpNode("eltwise_pow", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow_out = VarNode("eltwise_pow_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->assert_is_op_input("pad2d", "X") + ->AsIntermediate(); + auto* pad2d = OpNode("pad2d", "pad2d")->AsIntermediate(); + auto* pad2d_out = VarNode("pad2d_out") + ->assert_is_op_output("pad2d", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* pool2d = OpNode("pool2d", "pool2d")->AsIntermediate(); + auto* pool2d_out = VarNode("pool2d_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* fill_const = OpNode("fill_const", "fill_constant")->AsIntermediate(); + auto* fill_const_out = VarNode("fill_const_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_div = + OpNode("eltwise_div", "elementwise_div")->AsIntermediate(); + auto* eltwise_div_out = VarNode("eltwise_div_out") + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* eltwise_pow2 = + OpNode("eltwise_pow2", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow2_out = VarNode("eltwise_pow2_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->AsIntermediate(); + + auto* shape = OpNode("shape", "shape")->AsIntermediate(); + auto* shape_out = VarNode("shape_out") + ->assert_is_op_output("shape", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* fill_const2 = + OpNode("fill_const2", "fill_constant")->AsIntermediate(); + auto* fill_const2_out = VarNode("fill_const2_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* gather = OpNode("gather", "gather")->AsIntermediate(); + auto* gather_out = VarNode("gather_out") + 
->assert_is_op_output("gather", "Out") + ->assert_is_op_input("assign", "X") + ->AsIntermediate(); + auto* assign = OpNode("assign", "assign")->AsIntermediate(); + auto* assign_out = VarNode("assign_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* fill_const3 = + OpNode("fill_const3", "fill_constant")->AsIntermediate(); + auto* fill_const3_out = VarNode("fill_const3_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("assign") + ->AsIntermediate(); + auto* assign2 = OpNode("assign2", "assign")->AsIntermediate(); + auto* assign2_out = VarNode("assign2_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("cast", "X") + ->AsIntermediate(); + auto* cast = OpNode("cast", "cast")->AsIntermediate(); + auto* cast_out = VarNode("cast_out") + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("reshape2", "Shape") + ->AsIntermediate(); + + auto* reshape2 = OpNode("reshape2", "reshape2")->AsIntermediate(); + auto* reshape2_out = VarNode("reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* reshape2_xshape = VarNode("reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* matmul_y = + VarNode("matmul_y")->assert_is_op_input("matmul", "Y")->AsInput(); + auto* matmul = OpNode("matmul", "matmul")->AsIntermediate(); + auto* matmul_out = VarNode("matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add_y = VarNode("eltwise_add_y") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto* eltwise_add = + OpNode("eltwise_add", "elementwise_add")->AsIntermediate(); + auto* eltwise_add_out = VarNode("eltwise_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* norm = OpNode("norm", "norm")->AsIntermediate(); + auto* norm_out = VarNode("norm_out") + ->assert_is_op_output("norm", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* norm_norm = VarNode("norm_norm") + ->assert_is_op_output("norm", "Norm") + ->AsIntermediate(); + auto* fill_const4 = + OpNode("fill_const4", "fill_constant")->AsIntermediate(); + auto* fill_const4_out = VarNode("fill_const4_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add2 = + OpNode("eltwise_add2", "elementwise_add")->AsIntermediate(); + auto* eltwise_add2_out = VarNode("eltwise_add2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* fill_const5 = + OpNode("fill_const5", "fill_constant")->AsIntermediate(); + auto* fill_const5_out = VarNode("fill_const5_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + + auto* eltwise_div2 = + OpNode("eltwise_div2", "elementwise_div")->AsIntermediate(); + auto* eltwise_div2_out = 
VarNode("eltwise_div2_out") + ->assert_is_op_output("elementwise_div", "Out") + ->AsOutput(); + + *input >> *clip >> *clip_out >> *eltwise_pow >> *eltwise_pow_out >> + *pad2d >> *pad2d_out >> *pool2d >> *pool2d_out >> *eltwise_pow2; + *eltwise_y >> *eltwise_pow; + + *fill_const >> *fill_const_out >> *eltwise_div >> *eltwise_div_out >> + *eltwise_pow2; + *eltwise_y >> *eltwise_div; + + *eltwise_pow2 >> *eltwise_pow2_out >> *shape >> *shape_out >> *gather >> + *gather_out >> *assign >> *assign_out >> *concat >> *concat_out >> + *cast >> *cast_out >> *reshape2; + *fill_const2 >> *fill_const2_out >> *gather; + *fill_const3 >> *fill_const3_out >> *assign2 >> *assign2_out >> *concat; + *eltwise_pow2_out >> *reshape2; + + *reshape2 >> *reshape2_out >> *matmul >> *matmul_out >> *eltwise_add >> + *eltwise_add_out; + *reshape2 >> *reshape2_xshape; + *matmul_y >> *matmul; + *eltwise_add_y >> *eltwise_add; + + *eltwise_add_out >> *norm >> *norm_out >> *eltwise_add2 >> + *eltwise_add2_out >> *eltwise_mul >> *eltwise_mul_out >> + *eltwise_div2 >> *eltwise_div2_out; + *norm >> *norm_norm; + *fill_const4 >> *fill_const4_out >> *eltwise_add2; + *fill_const5 >> *fill_const5_out >> *eltwise_mul; + *eltwise_add_out >> *eltwise_div2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block2"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", {matched.at("matmul_y")->arg()->name}); + op_desc.SetInput("Scale", {"placeholder_last_fc"}); + op_desc.SetInput("Bias", {matched.at("eltwise_add_y")->arg()->name}); + op_desc.SetInput("Mean", {"placeholder_last_fc"}); + op_desc.SetInput("Var", {"placeholder_last_fc"}); + op_desc.SetOutput("Outputs", {matched.at("eltwise_div2_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto block2_stmt = matched.at("clip")->stmt(); + auto* scope = block2_stmt->op()->scope(); + auto pow_tensor_name = matched.at("eltwise_y")->arg()->name; + auto* pow_tensor = scope->FindTensor(pow_tensor_name); + float pool_p = pow_tensor->data()[0]; + op_desc.SetAttr("pool_p", pool_p); + auto* matmul_op_info = matched.at("matmul")->stmt()->op_info(); + CHECK(matmul_op_info->GetAttr("transpose_Y") == true) + << "Y of last fc must have been transposed"; + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); + static_cast(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, scope); + fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places()); + block2_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "matmul_y", "eltwise_add_y", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("clip")); + } + IR_OP_VAR_LINK(matched.at("clip"), matched.at("eltwise_div2_out")); + } +}; + +class XPUResNetCbamFuser : public xpu::XPUFuseBase { + public: + XPUResNetCbamFuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + 
->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = + VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_cbam_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_cbam_block0")->AsIntermediate(); 
+ auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block2 = + OpNode("resnet_block2", "resnet_cbam_block2")->AsIntermediate(); + auto* resnet_block2_out = + VarNode("resnet_block2_out") + ->assert_is_op_output("resnet_cbam_block2", "Outputs") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *resnet_block2 >> *resnet_block2_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void 
handle_placeholder_sa_conv(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void handle_placeholder_last_fc(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + auto filter_dims = filter_t->dims(); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + // XXX(miaotianxiang): Y has already been transposed in model... + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet_cbam"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + 
"resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + "resnet_block2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + + auto* resnet_cbam_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet_cbam_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + if (scale_name[i] == "placeholder_sa_conv") { + handle_placeholder_sa_conv( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } else if (scale_name[i] == "placeholder_last_fc") { + handle_placeholder_last_fc( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } + + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("Filter", filter_name); + 
op_desc.SetInput("Bias", bias_name); + op_desc.SetInput("MaxFilter", max_filter_name); + op_desc.SetOutput("Output", {matched.at("resnet_block2_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + auto* block2_op_info = matched.at("resnet_block2")->stmt()->op_info(); + op_desc.SetAttr("pool_p", block2_op_info->GetAttr("pool_p")); + + auto resnet_cbam_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet_cbam_op->Attach(op_desc, scope); + resnet_cbam_op->SetValidPlaces(resnet_cbam_stmt->op()->valid_places()); + auto kernels = + resnet_cbam_op->CreateKernels(resnet_cbam_op->valid_places()); + resnet_cbam_stmt->SetOp(resnet_cbam_op); + resnet_cbam_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("resnet_block2_out")); + } +}; + +} // namespace fusion + +class XPUResNetCbamFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetCbamBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetCbamBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNetCbamBlock2Fuser block2_fuser; + block2_fuser(graph.get()); + fusion::XPUResNetCbamFuser resnet_fuser; + resnet_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_cbam_fuse_pass, + paddle::lite::mir::XPUResNetCbamFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("__xpu__resnet_cbam"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc index de2210a76ea0647cb02131a088ceb754afd0ef9c..7024a872f30d3c78affe82648c902a6128de7070 100644 --- a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase { auto block0_stmt = matched.at("left_conv1")->stmt(); // block0_stmt->ResetOp(op_desc, graph->valid_places()); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); block0_stmt->SetOp(fake_subgraph_op); @@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase { auto block1_stmt = matched.at("right_conv1")->stmt(); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? 
- auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); block1_stmt->SetOp(fake_subgraph_op); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 68c07c0ffd0694aec0ff073082e1192213a0ef4a..20023830123939f1cf83706f69ca8a7a2703b646 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -25,21 +25,21 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; bool has_int8 = false; - bool has_arm_float = false; + bool has_arm = false; bool has_cuda = false; for (auto& place : graph->valid_places()) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } - if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - has_arm_float = true; + if (place.target == TARGET(kARM)) { + has_arm = true; } if (place.target == TARGET(kCUDA)) { has_cuda = true; } } - if (!has_int8 && has_arm_float) { + if (has_arm) { act_types.push_back("relu6"); act_types.push_back("leaky_relu"); } @@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 69be8dab0a06c26d5ca2bcdfe8327634edb9637d..a05f8fe8da5ee72581a9254b4d39354a0c5180e6 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -156,12 +156,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // little difference for int8 /////////////////////////////////////////////////////////////////////////////// if (enable_int8) { - PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"), - "INT8 mode: Conv should has weight_scale attr"); + std::string weight_name = conv_op_desc->Input("Filter").front(); + CHECK(conv_op_desc->HasInputScale(weight_name)) + << "INT8 mode: Conv should has weight_scale attr"; auto conv_weight_d = conv_weight_t->mutable_data(); // compute new conv_weight for int8 - auto weight_scale = - conv_op_desc->GetAttr>("weight_scale"); + auto weight_scale = conv_op_desc->GetInputScale(weight_name); if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -188,11 +188,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } } } - conv_op_desc->SetAttr("weight_scale", weight_scale); + conv_op_desc->SetInputScale(weight_name, weight_scale); } else if (is_weight_quantization) { std::string scale_name = conv_weight_name + "_quant_scale"; if (conv_op_desc->HasAttr(scale_name)) { - auto scale = conv_op_desc->GetAttr>(scale_name); + std::vector scale = + conv_op_desc->GetAttr>(scale_name); CHECK_EQ(scale.size(), alpha_tensor.numel()); for (size_t i = 0; i < scale.size(); i++) { scale[i] *= alpha_data[i]; diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h index 8bd8c0ce0600bb68667d96d07d43fa3028b5a856..841566067ba6675271227adfa82c74defac35f2a 100644 --- 
a/lite/core/mir/fusion/conv_bn_fuser.h +++ b/lite/core/mir/fusion/conv_bn_fuser.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/mir/pattern_matcher_high_api.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/mir/fusion/conv_conv_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9c4f0c02cd89e04d93af8e4dab71acc5d24e411 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/conv_conv_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ConvConvFusePass::Apply(const std::unique_ptr& graph) { + // initialze fuser params + std::vector conv_has_bias_cases{true, false}; + std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + bool has_arm = false; + for (auto& place : graph->valid_places()) { + if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { + has_arm = true; + break; + } + } + if (!has_arm) { + return; + } + // only support fp32 fusion + for (auto conv_has_bias0 : conv_has_bias_cases) { + for (auto conv_has_bias1 : conv_has_bias_cases) { + for (auto conv_type0 : conv_type_cases) { + for (auto conv_type1 : conv_type_cases) { + VLOG(4) << "conv_has_bias0:" << conv_has_bias0 + << " conv_type0:" << conv_type0; + VLOG(4) << "conv_has_bias1:" << conv_has_bias1 + << " conv_type1:" << conv_type1; + fusion::ConvConvFuser fuser( + conv_type0, conv_type1, conv_has_bias0, conv_has_bias1); + fuser(graph.get()); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass) + .BindTargets({TARGET(kARM)}); diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.h b/lite/core/mir/fusion/conv_conv_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..64e1b87ec9a8618572d6044f6dde2ab25c5a11c4 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ConvConvFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.cc b/lite/core/mir/fusion/conv_conv_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..737f96e69baa8953c0231fcc4c9e104907b17381 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ConvConvFuser::BuildPattern() { + auto* conv_input0 = VarNode("conv_input0") + ->assert_is_op_input(conv_type0_, "Input") + ->AsInput(); + auto* conv_weight0 = VarNode("conv_weight0") + ->assert_is_op_input(conv_type0_, "Filter") + ->AsInput(); + auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_); + auto* conv_out0 = VarNode("conv_out0") + ->assert_is_op_output(conv_type0_, "Output") + ->assert_is_op_input(conv_type1_, "Input") + ->AsIntermediate(); + + auto* conv_weight1 = VarNode("conv_weight1") + ->assert_is_op_input(conv_type1_, "Filter") + ->AsIntermediate(); + auto* conv1 = OpNode("conv2d1", conv_type1_) + ->assert_is_op(conv_type1_) + ->assert_op_attr("groups", 1) + ->AsIntermediate(); + + auto* conv_out1 = VarNode("conv_out1") + ->assert_is_op_output(conv_type1_, "Output") + ->AsOutput(); + + if (conv_has_bias0_) { + if (conv_has_bias1_) { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } else { + conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0}); + if (conv_has_bias1_) { + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } +} + +void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { + auto conv_instruct = matched.at("conv2d0")->stmt(); + auto conv_op_desc = conv_instruct->mutable_op_info(); + auto conv = conv_instruct->op(); + auto* scope 
= conv->scope(); + auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info(); + + // conv0 + auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name) + ->GetMutable<lite::Tensor>(); + + // conv1 + auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name) + ->GetMutable<lite::Tensor>(); + // auto groups0 = conv_op_desc->GetAttr<int>("groups"); + auto groups1 = conv_op_desc1->GetAttr<int>("groups"); + auto strides1 = conv_op_desc1->GetAttr<std::vector<int>>("strides"); + auto paddings1 = conv_op_desc1->GetAttr<std::vector<int>>("paddings"); + auto dilations1 = conv_op_desc1->GetAttr<std::vector<int>>("dilations"); + + bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; + bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? true : false; + int kw = weight1_t->dims()[2]; + int kh = weight1_t->dims()[3]; + if (!(kw == 1 && kh == 1)) { + return; + } + CHECK_EQ(enable0_int8, enable1_int8) << "The Conv compute type must be the same"; + CHECK_EQ(groups1, 1) << "The groups of weight1_dim must be 1"; + CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1]) + << "weight0_dims[0] == weight1_dim[1]"; + for (int i = 0; i < strides1.size(); i++) { + CHECK_EQ(strides1[i], 1) << "strides[" << i << "]: " << strides1[i] + << " must be 1"; + } + for (int i = 0; i < paddings1.size(); i++) { + CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i] + << " must be 0"; + } + for (int i = 0; i < dilations1.size(); i++) { + CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i] + << " must be 1"; + } + // compute new_weight and new_bias + /////////////////////////////////////////////////////////////////////////////// + // Compute ConvConvFuser + // Before fusion + // + // conv(x) = kx + z = y + // conv(y) = ay + b + // + // After fusion: + // + // conv(conv(x)) = a(kx + z) + b = akx + az + b + // + // new_weights = ak + // new_bias = az + b + /////////////////////////////////////////////////////////////////////////////// + if (enable0_int8) { + LOG(FATAL) << "int8 conv_conv fusion is not supported"; + return; + } else { + // compute new conv_weight + Tensor weight_tensor; + auto in_dims = weight0_t->dims(); + auto weight_dims = weight1_t->dims(); + const float* din = weight0_t->data<float>(); + const float* weights = weight1_t->data<float>(); + int oc0 = in_dims[0]; + int ic = in_dims[1]; + int ih = in_dims[2]; + int iw = in_dims[3]; + int oc = weight_dims[0]; + weight_tensor.Resize({oc, ic, ih, iw}); + float* dout = weight_tensor.mutable_data<float>(); + ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc); + weight0_t->CopyDataFrom(weight_tensor); + } + // compute new conv_bias + if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") && + conv_op_desc->Input("Bias").size() > 0) { + auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name) + ->GetMutable<lite::Tensor>(); + if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") && + conv_op_desc1->Input("Bias").size() > 0) { + auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name) + ->GetMutable<lite::Tensor>(); + Tensor bias; + bias.CopyDataFrom(*bias_t1); + auto bias_data = bias.mutable_data<float>(); + ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1); + bias_t1->CopyDataFrom(bias); + conv_op_desc->SetInput( + "Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias + IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0")); + } else { + Tensor bias; + auto weight_dims = weight1_t->dims(); + bias.Resize({weight_dims[0]}); + auto bias_d = bias.mutable_data<float>(); + ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr); + bias_t0->CopyDataFrom(bias); + conv_op_desc->SetInput( + 
"Bias", {matched.at("conv_bias0")->arg()->name}); // conv_bias + } + } else { + if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") && + conv_op_desc1->Input("Bias").size() > 0) { + conv_op_desc->SetInput( + "Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias + IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0")); + } + } + conv_op_desc->SetType(conv_type0_); + conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name}); + conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name}); + conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name}); + + auto update_conv_desc = *conv_instruct->mutable_op_info(); + conv_instruct->ResetOp(update_conv_desc, graph->valid_places()); + + IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1")); +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.h b/lite/core/mir/fusion/conv_conv_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1f58d1c8746a137e2078006016ec6007c2afbb --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.h @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ConvConvFuser : public FuseBase { + public: + explicit ConvConvFuser(const std::string& conv_type0, + const std::string& conv_type1, + const bool conv_has_bias0, + const bool conv_has_bias1) + : conv_type0_(conv_type0), + conv_type1_(conv_type1), + conv_has_bias0_(conv_has_bias0), + conv_has_bias1_(conv_has_bias1) {} + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + void ComputeNewWeight(float* dout, + const float* din, + const float* weights, + int oc0, + int ic, + int ih, + int iw, + int oc1) { + // input conv_weight0_t weights conv_weight1_t + // output weight_tensor + // ksize = 1 + int in_size = ih * iw; + int in_channel_size = ic * in_size; + // out = w1[j, i, ih, iw] * w2[k, j, kw, kh] + // out_dim = [oc1, ic, kh, kw], din_dim = [oc0, ic, kh, kw] + // weight_dim = [oc1, oc0, kh, kw] + for (int k = 0; k < oc1; k++) { + const float* weights_ptr = weights + k * oc0; + float* out_ptr = dout + k * in_channel_size; + for (int c = 0; c < ic; c++) { + float* out_ptr_channel = out_ptr + c * in_size; + const float* din_ptr = din + c * in_size; + for (int i = 0; i < in_size; i++) { + float sum = 0.f; + for (int j = 0; j < oc0; j++) { + sum += din_ptr[j * in_channel_size] * weights_ptr[j]; + } + *out_ptr_channel++ = sum; + } + } + } + } + + void ComputeNewBias(float* dout, + Tensor* bias0_tensor, + Tensor* weight_tensor, + Tensor* bias1_tensor) { + // input bias0_tensor weight_tensor bias1_tensor + // output bias_tensor + auto in_dims = bias0_tensor->dims(); + auto weight_dims = weight_tensor->dims(); + const float* din = bias0_tensor->data(); + const float* weights = weight_tensor->data(); + int ic = in_dims[0]; + int oc = weight_dims[0]; + // out_k = b0[num, j, 1, 1] * w2[k, j, 1, 1] + if (bias1_tensor) { + const float* din2 = bias1_tensor->data(); + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + sum += din[j] * weights_ptr[j]; + } + dout[k] = sum + din2[k]; + } + } else { + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + sum += din[j] * weights_ptr[j]; + } + dout[k] = sum; + } + } + } + + private: + std::string conv_type0_{"conv2d"}; + std::string conv_type1_{"conv2d"}; + bool conv_has_bias0_{false}; + bool conv_has_bias1_{false}; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index e2d8f96c53bd76d9495035c6ec56a5364b9bdcf5..d9bffffebfaabcca9c63700caf6e3ee91fa2eecb 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -24,8 +24,13 @@ namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { #ifdef LITE_WITH_X86 +#ifdef LITE_WITH_MLU + fusion::FcFuser fuser(false); + fuser(graph.get()); +#else fusion::FcFuser fuser(true); fuser(graph.get()); +#endif #endif fusion::FcFuser fuser2(false); @@ -38,7 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kX86)}) + .ExcludeTargets({TARGET(kXPU)}) +#ifndef 
LITE_WITH_MLU + .ExcludeTargets({TARGET(kX86)}) +#endif .ExcludeTargets({TARGET(kBM)}) - .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 3c99131083d37ea2c8511ed136bff17c891529af..8fdde50fc3015b411ee13fed15e92a93a1c722e5 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -71,7 +71,20 @@ void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + auto op_desc = *matched.at("mul")->stmt()->op_info(); + + // Get the input scale from mul + std::vector x_scale_vct; + std::vector y_scale_vct; + auto input_x_name = op_desc.Input("X").front(); + auto input_y_name = op_desc.Input("Y").front(); + bool is_quantized_op = op_desc.HasInputScale(input_x_name) && + op_desc.HasInputScale(input_y_name); + if (is_quantized_op) { + x_scale_vct = op_desc.GetInputScale(input_x_name); + y_scale_vct = op_desc.GetInputScale(op_desc.Input("Y").front()); + } + op_desc.mutable_inputs()->clear(); op_desc.mutable_outputs()->clear(); op_desc.SetType("fc"); @@ -85,6 +98,13 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { if (with_relu_) { op_desc.SetAttr("activation_type", std::string{"relu"}); } + + // Set the input scale into fc + if (is_quantized_op) { + op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale_vct); + op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct); + } + return op_desc; } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 80a033c75f2e23efa091375ee2a9f78e3ff40d71..da42d6d0c79a2a7975eacca7095fedababac6d89 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -34,19 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // fuse quantized node and dequant node - for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) { + std::vector quantized_op_types = { + "conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"}; + for (auto& op_type : quantized_op_types) { fusion::DequantOpFuser fuser(op_type); fuser(graph.get()); } - - for (auto& op_type : {"conv2d", "depthwise_conv2d"}) { + for (auto& op_type : quantized_op_types) { fusion::ChannelWiseDequantOpFuser fuser(op_type); fuser(graph.get()); } // process quant_dequant_node - fusion::DeleteQuantDequantOpFuser dqd_fuser; - dqd_fuser(graph.get()); + std::vector quant_dequant_op_types = { + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_moving_average_abs_max"}; + for (auto& op_type : quant_dequant_op_types) { + fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type); + dqd_fuser(graph.get()); + } } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f6d03cc23d56f8ae25f22b5b2667ed451ef8afaa..758a85c84064fa8d1953a6531300208d13525634 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -23,6 +23,20 @@ namespace lite { namespace mir { namespace fusion { +static std::string GetWeightArgname(const std::string& op_type) { + std::string weight_argname{}; + std::vector conv_ops = { + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; + std::vector mul_ops = {"mul", "matmul"}; + if (std::find(conv_ops.begin(), conv_ops.end(), op_type) != conv_ops.end()) { + 
weight_argname = "Filter"; + } else if (std::find(mul_ops.begin(), mul_ops.end(), op_type) != + mul_ops.end()) { + weight_argname = "Y"; + } + return weight_argname; +} + void DeleteQuantOpFuser::BuildPattern() { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_op_type_, "InScale"); @@ -64,13 +78,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, for (auto* quantized_node : outlinks) { // save input scale in quantized op by input argname + index auto op_desc = *quantized_node->stmt()->mutable_op_info(); - std::string argname; - int index; - op_desc.GetInputArgname(out_act_name, &argname); - op_desc.GetInputIndex(out_act_name, &index); - op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetInputScale(out_act_name, {scale_value}); op_desc.SetAttr("bit_length", bit_length); op_desc.UpdateAllInputs(out_act_name, in_act_name); quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); @@ -89,20 +97,13 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DequantOpFuser::BuildPattern() { - std::string weight_name = ""; - if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, weight_name) + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -135,6 +136,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* quantized_op = matched.at("quantized_op"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); @@ -150,14 +152,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // = max(abs(weight)) / range // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); auto quantized_weight_var_name = quantized_op_weight->arg()->name; auto quantized_weight_t = scope->FindVar(quantized_weight_var_name)->GetMutable(); std::vector weight_scale; - int weight_scale_size; + int weight_scale_size = 0; if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should @@ -173,7 +176,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
Tensor temp_tensor; @@ -204,12 +207,13 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void ChannelWiseDequantOpFuser::BuildPattern() { std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs"; + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, "Filter") + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -246,6 +250,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); @@ -265,17 +270,20 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, } // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); } - op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + if (quantized_op_type_ != "conv2d_transpose") { + op_desc.SetAttr("enable_int8", true); + } + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
auto quantized_weight_var_name = quantized_op_weight->arg()->name; @@ -307,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DeleteQuantDequantOpFuser::BuildPattern() { - std::string quant_dequant_op_type = - "fake_quantize_dequantize_moving_average_abs_max"; - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = - VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); + auto* input_act_node = VarNode("input_act_node") + ->assert_is_op_input(quant_dequant_op_type_, "X"); + auto* quant_dequant_node = + OpNode("quant_dequant_node", quant_dequant_op_type_) + ->assert_is_op(quant_dequant_op_type_); auto* output_scale_node = VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + ->assert_is_op_output(quant_dequant_op_type_, "OutScale"); auto* output_act_node = VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + ->assert_is_op_output(quant_dequant_op_type_, "Out"); + + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type_, "InScale"); + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + } else { + quant_dequant_node->LinksFrom({input_act_node}); + } output_scale_node->LinksFrom({quant_dequant_node}); output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto* input_scale_node = matched.at("input_scale_node"); auto* input_act_node = matched.at("input_act_node"); auto* quant_dequant_node = matched.at("quant_dequant_node"); auto* output_scale_node = matched.at("output_scale_node"); @@ -352,22 +363,7 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, // Save quantization info in op_info attr auto op_info = *quantized_node->stmt()->op_info(); op_info.SetAttr("bit_length", bit_length); - - std::string argname; - int index; - op_info.GetInputArgname(output_act_name, &argname); - op_info.GetInputIndex(output_act_name, &index); - op_info.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - std::string op_type = op_info.Type(); - // Analyse the weight scale or input scale. 
- if (((op_type == "conv2d" || op_type == "depthwise_conv2d") && - argname == "Input") || - ((op_type == "mul" || op_type == "matmul") && argname == "Y")) { - op_info.SetAttr("weight_scale", scale_value); - } else { - op_info.SetAttr("input_scale", scale_value); - } + op_info.SetInputScale(output_act_name, {scale_value}); op_info.UpdateAllInputs(output_act_name, input_act_name); quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); @@ -375,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, } // delete nodes and edges std::set nodes2rm = { - input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + quant_dequant_node, output_scale_node, output_act_node}; + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = matched.at("input_scale_node"); + nodes2rm.insert(input_scale_node); + } GraphSafeRemoveNodes(graph, nodes2rm); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index ac3ac112b3aa504bc075125f2f13292073ca9444..c2dd1e5191cf0ad9b242dfa230abe3d38bad0cf7 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -86,17 +86,22 @@ class ChannelWiseDequantOpFuser : public FuseBase { std::string quantized_op_type_{}; }; -/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * quantized_op" can be deteted by this fuser. The fuser modifies the input - * scale for the quantized_op and deletes the fake_quant_dequant_op. +/* The pattern like "fake_quantize_dequantize_op + quantized_op" can be + * deteted by this fuser. The fuser modifies the input scale for the + * quantized_op and deletes the fake_quant_dequant_op. 
*/ class DeleteQuantDequantOpFuser : public FuseBase { public: + explicit DeleteQuantDequantOpFuser(const std::string& quant_dequant_op_type) + : quant_dequant_op_type_(quant_dequant_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + + private: + std::string quant_dequant_op_type_{}; }; } // namespace fusion diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc index d578b725ec42c926e5f0581fd8eeef855e586bdc..68417783e932f3c882eaae38e620b8b651b937dd 100644 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc +++ b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc @@ -84,11 +84,12 @@ cpp::OpDesc TransposeSoftmaxTransposeFuser::GenOpDesc( op_desc.SetInput("X", {matched.at("x1")->arg()->name}); op_desc.SetOutput("Out", {matched.at("out")->arg()->name}); op_desc.SetAttr("axis", - matched.at("transpose1") - ->stmt() - ->op_info() - ->GetAttr>("axis") - .back()); + *(matched.at("transpose1") + ->stmt() + ->op_info() + ->GetAttr>("axis") + .end() - + 1)); return op_desc; } diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index d7486c0933dbbe74115bd6358962817b2b946c12..3c9bac1c5b9fbf6d48683f6423a4c670b17cb127 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { nodes_in_order = graph->StmtTopologicalOrder(); } + insts_.emplace_back(); for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); @@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { .SetSyncStreams(stmt.sync_streams_); } #endif - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); + insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } } diff --git a/lite/core/mir/generate_program_pass.h b/lite/core/mir/generate_program_pass.h index b126b4aba4d09a95a0033b04ed241812c88a3287..2ef4d035710d9542b365789aeabe8a08537ff225 100644 --- a/lite/core/mir/generate_program_pass.h +++ b/lite/core/mir/generate_program_pass.h @@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass { } private: - std::vector insts_; + std::vector> insts_; }; } // namespace mir diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 55b7a004567ec5a5298e084839d6dcf5a8591882..98b1597b49b9a7e151c86d11843e45163890191a 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -62,15 +62,17 @@ std::string Visualize(mir::SSAGraph* graph) { << string_trunc(op_info->GetAttr(attr_name)) << "\""; break; case AttrType::FLOATS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = + op_info->GetAttr>(attr_name); os << ":floats: {" + Join(vals, ",") << "}"; } break; case AttrType::INTS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = op_info->GetAttr>(attr_name); os << ":ints: {" + Join(vals, ",") + "}"; } break; case AttrType::STRINGS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = + op_info->GetAttr>(attr_name); os << ":strings: {" + string_trunc(Join(vals, ",")) << "}"; } break; default: diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 
5ad094fd4219bcbb3c59ec1c71f42af6cac5a11a..eddbebb545351fa6b1820682af487bb7b04e8bb3 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -314,4 +314,6 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU), - TARGET(kAPU)}); + TARGET(kAPU), + TARGET(kMLU), + TARGET(kHuaweiAscendNPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index ba48d5d4ead5ea922ded0bff3a87c2c127595790..e09220d083ee8241001b6d9d55fb48eb1ba74f2e 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -14,18 +14,22 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include +#include #include #include #include #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +static thread_local int g_stream_id = 0; + Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, SSAGraph* graph, @@ -37,6 +41,10 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, cast_arg->AsArg().type = cast_type; inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + VLOG(4) << "insert cast before subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); + // create the stmt node auto* cast_inst = graph->NewInstructNode(); // create op @@ -60,14 +68,17 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, CHECK(0) << "Unsupport cast type"; } cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { if (op_type == "cast") { const Type* in_arg_ty = kernel->GetInputDeclType("X"); - if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } } else if (op_type == "layout") { @@ -83,24 +94,22 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && - TargetCompatibleTo(*out_arg_ty, *cast_type)) { + TargetCompatibleTo(*out_arg_ty, *cast_type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { CHECK(0) << "Unsupport cast type"; } if (is_found) { + VLOG(4) << "insert kernel: " << kernel->name(); selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), 
g_stream_id)); break; } } @@ -124,6 +133,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type var->GetMutable(); + VLOG(4) << "insert cast after subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -133,8 +145,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cpp::OpDesc op_desc; op_desc.SetType(op_type); if (op_type == "cast") { - op_desc.SetAttr("in_dtype", 4); // FP32 - op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetAttr("in_dtype", 4); // FP16 + op_desc.SetAttr("out_dtype", 5); // FP32 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "layout") { @@ -150,8 +162,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { @@ -164,14 +177,17 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (DataLayoutCompatible(*in_arg_ty, *cast_type) && - DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cast_type) && - TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { @@ -182,13 +198,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -203,7 +214,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, void MLUPostprocessPass::InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* head_type = head_node->AsArg().type; // break original link @@ -218,39 +230,52 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name) != first_conv_nodes_.end(); // precision cast node - if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { + if (!use_mlu_cast) { + if (head_type->precision() != inst_type->precision() && + !is_first_conv_head) { + cur_node = 
InsertCastBefore("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + head_type->layout())); + } + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), head_type->layout())); - } - - // layout cast node - if (head_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { + // io copy cur_node = InsertCastBefore( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), head_type->precision(), head_type->layout())); } - // io copy - cur_node = InsertCastBefore( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(cur_node, inst_node); @@ -259,13 +284,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); - UpdateInputTo( - sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -311,10 +342,9 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, CHECK(subgraph_precision == PRECISION(kFloat) || subgraph_precision == PRECISION(kFP16)) << "Mlu node has unsupport precision"; - VLOG(4) << "picked kernel precision: " - << PrecisionToStr(subgraph_precision); *arg_type = LiteType::GetTensorTy( subgraph_target, subgraph_precision, subgraph_layout); + VLOG(4) << "picked subgraph kernel type: " << (*arg_type)->name(); break; } } @@ -356,7 +386,8 @@ bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { void MLUPostprocessPass::InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* tail_type = tail_node->AsArg().type; // break original link @@ -367,39 +398,50 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; // precision cast node - if (tail_type->precision() != inst_type->precision()) { + if (!use_mlu_cast) { + if (tail_type->precision() != inst_type->precision()) { + 
cur_node = InsertCastAfter("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + tail_type->layout())); + } + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), tail_type->layout())); - } - - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { cur_node = InsertCastAfter( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), tail_type->precision(), tail_type->layout())); } - // io copy - cur_node = InsertCastAfter( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(inst_node, cur_node); @@ -408,21 +450,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); UpdateOutputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); /* graph like this * subgraph_op_0 * / \ * / \ * subgraph_op_1 host_op */ - UpdateInputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -446,15 +494,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } -bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { - auto* block_desc = - static_cast(inst->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { - auto op_desc = block_desc->GetOp(op_idx); - CHECK(op_desc); - if (op_desc->Type() == "conv2d") { - for (auto& names : op_desc->inputs()) { +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, + Node* inst_node) { + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < 
sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + CHECK(sub_op_desc); + if (sub_op_desc->Type() == "conv2d") { + for (auto& names : sub_op_desc->inputs()) { if (std::find(names.second.begin(), names.second.end(), arg_node->AsArg().name) != names.second.end()) { @@ -496,6 +551,74 @@ void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { } } +void MLUPostprocessPass::ModifyInputOutputDataType(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph); + for (auto& in_node : node.inlinks) { + const auto* in_node_type = in_node->AsArg().type; + VLOG(4) << "MLU subgraph input type: " << in_node->AsArg().name + << *in_node_type; + if (in_node->AsArg().is_weight || in_node->AsArg().is_persist) { + CHECK(in_node_type->target() == TARGET(kHost) && + in_node_type->precision() == PRECISION(kAny) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + in_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK((in_node_type->target() == TARGET(kHost) || + in_node_type->target() == TARGET(kX86)) && + in_node_type->precision() == PRECISION(kFloat) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected common input type!"; + } + } + for (auto& out_node : node.outlinks) { + const auto* out_node_type = out_node->AsArg().type; + auto& out_arg = out_node->AsArg(); + VLOG(4) << "MLU subgraph output type: " << out_node->AsArg().name + << *out_node_type; + if (out_node->AsArg().is_weight || out_node->AsArg().is_persist) { + CHECK(out_node_type->target() == TARGET(kHost) && + out_node_type->precision() == PRECISION(kAny) && + out_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + out_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else if (out_node_type->precision() == PRECISION(kAny) && + out_node->outlinks.empty()) { + out_arg.is_persist = true; + out_arg.type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK(out_node_type->precision() == PRECISION(kFloat)) + << "MLU subgraph unexpected common output type!"; + if (out_node->outlinks.empty()) { + out_arg.type = LiteType::GetTensorTy(TARGET(kHost), + subgraph_arg_type->precision(), + DATALAYOUT(kNHWC)); + VLOG(4) << "unused output node type: " << out_arg.name + << out_node_type->name(); + } else { + out_arg.type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + VLOG(4) << "output node type: " << out_arg.name + << out_node_type->name(); + } + } + const auto target = out_node->AsArg().type->target(); + const auto precision = out_node->AsArg().type->precision(); + const auto layout = out_node->AsArg().type->layout(); + VLOG(4) << "arg name: " << out_node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -515,6 +638,16 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { old_type->precision(), paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); + // modify inst feed to NHWC, 
while set_mlu_input_layout(kNHWC) + // invoked, to keep consistent with actual data layout + auto place = node.AsStmt().place(); + place.layout = DATALAYOUT(kNHWC); + std::vector valid_places = {place}; + auto updated_op_info = *node.AsStmt().op_info(); + node.AsStmt().ResetOp(updated_op_info, valid_places, nullptr); + auto kernel = &(node.AsStmt().picked_kernel()); + VLOG(4) << "kernel info: " << kernel->name(); + node.AsStmt().op()->AttachKernel(kernel); } } } @@ -540,6 +673,219 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } } +std::pair CheckInputAndInsert(Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& input_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = input_name; + bool do_insert = false; + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_op = block_desc->AddOp(); + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph input, arg tensor name: " + << layout_arg_name; + layout_op->SetType("layout"); + layout_op->SetInput("Input", {cur_node}); + layout_op->SetOutput("Out", {layout_arg_name}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (!PrecisionCompatible(*tensor_type, *subgraph_type) && + tensor_type->precision() != PRECISION(kInt8) && + tensor_type->precision() != PRECISION(kInt32)) { + auto cast_op = block_desc->AddOp(); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph input, arg tensor name: " + << cast_arg_name; + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 5); // FP32 + cast_op->SetAttr("out_dtype", 4); // FP16 + cast_op->SetInput("X", {cur_node}); + cast_op->SetOutput("Out", {cast_arg_name}); + cur_node = cast_arg_name; + do_insert = true; + } + + return std::make_pair(do_insert, cur_node); +} + +std::pair CheckOutputAndInsert( + Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& output_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = output_name; + bool do_insert = false; + cpp::OpDesc *layout_op = nullptr, *cast_op = nullptr; + size_t cast_idx = 0; + + // subgraph -> cast -> layout -> output + if (!PrecisionCompatible(*tensor_type, *subgraph_type)) { + cast_op = block_desc->AddOp(); + cast_idx = block_desc->OpsSize() - 1; + CHECK_EQ(cast_op, block_desc->GetOp(cast_idx)); + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 4); // FP16 + cast_op->SetAttr("out_dtype", 5); // FP32 + do_insert = true; + } + + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph output, arg tensor name: " + << layout_arg_name; + layout_op = block_desc->AddOp(); + layout_op->SetType("layout"); + layout_op->SetInput("Input", {layout_arg_name}); + layout_op->SetOutput("Out", {cur_node}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (cast_op) { + cast_op = block_desc->GetOp(cast_idx); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph output, arg tensor name: " + << cast_arg_name; + cast_op->SetInput("X", {cast_arg_name}); + cast_op->SetOutput("Out", {cur_node}); + cur_node = cast_arg_name; + } + + return std::make_pair(do_insert, cur_node); +} + +// insert cast op on mlu, to avoid cast on cpu +void 
MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, + const Type* subgraph_type) { + CHECK_EQ(subgraph_node->AsStmt().op()->Type(), "subgraph"); + auto subgraph_op = + dynamic_cast(subgraph_node->AsStmt().op().get()); + CHECK(subgraph_op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = const_cast( + sub_program_desc->GetBlock(sub_block_idx)); + + // create a new block desc to keep op sequence correct + cpp::BlockDesc new_block_desc; + new_block_desc.ClearOps(); + new_block_desc.ClearVars(); + new_block_desc.SetIdx(sub_block_desc->Idx()); + new_block_desc.SetParentIdx(sub_block_desc->ParentIdx()); + new_block_desc.SetForwardBlockIdx(sub_block_desc->ForwardBlockIdx()); + + // find all IO that is not weight or persist + std::list i_names, o_names; + std::map node_replace; + + // Insert cast op for iotensor which is not weight or persist + for (auto& input : subgraph_node->inlinks) { + auto input_name = input->AsArg().name; + if (!(input->AsArg().is_weight || input->AsArg().is_persist)) { + i_names.emplace_back(input_name); + auto ret = CheckInputAndInsert(subgraph_op->scope(), + &new_block_desc, + input_name, + input->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[input_name] = ret.second; + } + } + } + for (auto& output : subgraph_node->outlinks) { + auto output_name = output->AsArg().name; + if (!(output->AsArg().is_weight || output->AsArg().is_persist)) { + o_names.emplace_back(output_name); + auto ret = CheckOutputAndInsert(subgraph_op->scope(), + sub_block_desc, + output_name, + output->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[output_name] = ret.second; + } + } + } + + // update input and output + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + auto new_op_desc = new_block_desc.AddOp(); + *new_op_desc = *sub_op_desc; + + if (sub_op_desc->Type() != "layout" && sub_op_desc->Type() != "cast") { + auto op_input_args = new_op_desc->InputArgumentNames(); + for (auto& input_arg : op_input_args) { + auto op_input = new_op_desc->Input(input_arg); + for (auto& it : i_names) { + auto index = std::find(op_input.begin(), op_input.end(), it); + if (index != op_input.end() && + node_replace.find(it) != node_replace.end()) { + index = op_input.erase(index); + op_input.emplace(index, node_replace.at(it)); + VLOG(4) << new_op_desc->Type() << "] change input from " << it + << " to " << node_replace.at(it); + } + } + new_op_desc->SetInput(input_arg, op_input); + } + + auto op_output_args = new_op_desc->OutputArgumentNames(); + for (auto& output_arg : op_output_args) { + auto op_output = new_op_desc->Output(output_arg); + for (auto& it : o_names) { + auto index = std::find(op_output.begin(), op_output.end(), it); + if (index != op_output.end() && + node_replace.find(it) != node_replace.end()) { + index = op_output.erase(index); + op_output.emplace(index, node_replace.at(it)); + VLOG(4) << new_op_desc->Type() << "] change output from " << it + << " to " << node_replace.at(it); + } + } + new_op_desc->SetOutput(output_arg, op_output); + } + } + } + + *sub_block_desc = new_block_desc; +} + +void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) { + // remove invalid places, since only support X86, host, MLU + auto v_places = graph->valid_places(); + for (auto it = v_places.begin(); it != v_places.end();) { + if (it->target != TARGET(kMLU) 
&& it->target != TARGET(kHost) && + it->target != TARGET(kX86)) { + it = v_places.erase(it); + } else { + ++it; + } + } + + if (use_mlu_cast) { + // insert mlu float place for float io copy, no effect to subgraph type + v_places.emplace_back(TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)); + } + + graph->SetValidPlaces(v_places); + VLOG(4) << "valid places after modified:"; + for (auto& p : v_places) { + VLOG(4) << p.DebugString(); + } +} + void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // currently for non-persistent input and output args, mlu subgraph op // only support float16/float32 data type @@ -549,35 +895,47 @@ void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // arg_in and arg_out are assumed to be NHWC which user should be aware of. // Thus here we change these args' layout to NHWC #ifdef LITE_WITH_MLU - if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyInputOutputDataType(graph.get()); + + if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::DeviceInfo::Global().UseFirstConv()) { + if (lite::TargetWrapperMlu::UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } #endif + g_stream_id = static_cast(reinterpret_cast(graph.get())); + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + ModifyValidPlaces(graph.get(), !disable_mlu_cast); // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { const Type* subgraph_arg_type = nullptr; GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + if (!disable_mlu_cast) { + AdjustSubgraph(&node, subgraph_arg_type); + } auto links_tmp = node.inlinks; for (auto p_in : links_tmp) { if (NeedInsert(p_in, subgraph_arg_type)) { - InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + InsertBefore( + graph.get(), p_in, &node, subgraph_arg_type, !disable_mlu_cast); } } links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); for (auto p_out : links_tmp) { if (NeedInsert(p_out, subgraph_arg_type)) { - InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + InsertAfter( + graph.get(), p_out, &node, subgraph_arg_type, !disable_mlu_cast); } } } } + // std::vector> subgraphs({graph->NodeTopologicalOrder()}); + // SubgraphVisualizer(graph.get(), subgraphs)(); } } // namespace mir diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 688dd06fb5fbec0c8e1c53acfe4215456ddb4192..5a31c1d8322db7bbc57de8dd18fdaf8ff4b0c885 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass { const Type** arg_type, SSAGraph* graph); + void ModifyInputOutputDataType(SSAGraph* graph); + void ModifyLayout(SSAGraph* graph); bool NeedInsert(Node* node, const Type* inst_type); @@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass { void InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); void InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); Node* InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, @@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass { bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + void AdjustSubgraph(Node* subgraph_node, const Type* 
op_type); + private: std::set first_conv_nodes_; }; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 66b37446a4cc6a33c09757266c9dd2cbc818325e..259447aa21b76261a266a243dcc9c2a7530c9dc5 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -37,34 +37,53 @@ void QuantizedOpAttributesInferencePass::Apply( auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); - if (!op_info->HasAttr("input_scale")) continue; - bool found = false; - float output_scale; + + // Check if any of the inputs of the op have scale value + bool has_input_scale = false; + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + auto in_var_node_name = in_var_node->arg()->name; + has_input_scale |= op_info->HasInputScale(in_var_node_name); + } + if (!has_input_scale) continue; + + // Infer the output scale according to its out_threshold or the input scale + // of its adjacent ops + bool is_quantized = true; for (auto out_var_node : op_node->outlinks) { CHECK(out_var_node->IsArg()); + std::vector output_scale; + bool has_output_scale = false; + auto out_var_node_name = out_var_node->arg()->name; for (auto out_op_node : out_var_node->outlinks) { CHECK(out_op_node->IsStmt()); auto& out_inst = out_op_node->AsStmt(); auto out_op_info = out_inst.op_info(); - if (!out_op_info->HasAttr("input_scale")) continue; - auto input_scale = out_op_info->GetAttr("input_scale"); - if (!found) { - found = true; + if (!out_op_info->HasInputScale(out_var_node_name)) continue; + auto input_scale = out_op_info->GetInputScale(out_var_node_name); + if (!has_output_scale) { output_scale = input_scale; + has_output_scale = true; } else { - CHECK_EQ(output_scale, input_scale); + CHECK_EQ(output_scale.size(), input_scale.size()); } } + if (has_output_scale) { + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else if (op_info->HasAttr("out_threshold")) { + // Only consider one output, there are only one out_threshold + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = std::vector{ + op_info->GetAttr("out_threshold") / range}; + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else { + is_quantized = false; + } } - if (found) { - inst.mutable_op_info()->SetAttr("output_scale", output_scale); - } else if (op_info->HasAttr("output_scale")) { - int bit_length = op_info->GetAttr("bit_length"); - int range = (1 << (bit_length - 1)) - 1; - output_scale = op_info->GetAttr("output_scale"); - inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); - } - if (op_info->HasAttr("output_scale")) { + + // Fix the missing of the attribute 'enable_int8'. 
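As a concrete instance of the out_threshold conversion above: with the usual bit_length of 8 the quantization range is (1 << 7) - 1 = 127, so an op carrying out_threshold = 6.35 (an illustrative value) gets output_scale = 6.35 / 127 = 0.05, stored per output tensor name via SetOutputScale.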
+ if (is_quantized) { inst.mutable_op_info()->SetAttr("enable_int8", true); } } diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 5b6f968484b7b49838a004c3edfd00ff9b7e5e5e..7ad833b22885204130b50a931dc2da7d040c654c 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( inst.picked_kernel().target())); } +#elif LITE_WITH_MLU + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), + static_cast(reinterpret_cast(graph.get())))); #else int stream_id = inst.stream_id_; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index f8991a359b177799cc5f59651c5d305fe64231ef..9cf7bc8995766e47895ce3dd2ef6bf7bcb614e5c 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -153,60 +153,61 @@ Node *SSAGraph::GraphCreateInstructNode( } void SSAGraph::Build(const Program &program, - const std::vector &valid_places) { + const std::vector &valid_places, + int block_idx) { CHECK(node_storage_.empty()); - auto weights_name = program.weights(); - auto is_weights = [&](const std::string &name) -> bool { - auto it = std::find(weights_name.begin(), weights_name.end(), name); - if (it == weights_name.end()) return false; + auto weights = program.weights(); + auto is_weight = [&](const std::string &name) -> bool { + auto it = std::find(weights.begin(), weights.end(), name); + if (it == weights.end()) return false; return true; }; - std::map var_types = program.var_data_type(); - - std::map arg_update_node_map_; - for (auto &op : program.ops()) { + auto var_type_map = program.var_type_map(); + std::map arg_update_node_map; + for (auto &op : program.ops(block_idx)) { VLOG(3) << op->op_info()->Type(); auto *op_node = GraphCreateInstructNode(op, valid_places); - for (const std::string &name : op->op_info()->input_names()) { + auto *op_info = op->op_info(); + const auto &op_type = op_info->Type(); + for (const auto &var_name : op_info->input_names()) { mir::Node *arg_node = nullptr; - if (arg_update_node_map_.count(name)) { - arg_node = arg_update_node_map_.at(name); + if (arg_update_node_map.count(var_name)) { + arg_node = arg_update_node_map.at(var_name); } else { node_storage_.emplace_back(); arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; } - if (var_types.count(name)) { + if (var_type_map.count(var_name)) { if (!arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->arg()->type = var_type_map[var_name]; } // Store the original data type of the output tensors for // type_precision_cast_pass, to keep the consistency between the // output types of original graph and optimized graph's - if (op->op_info()->Type() == "fetch") { + if (op_type == "fetch") { op->mutable_op_info()->SetAttr( - "data_type", static_cast(var_types[name])); + "data_type", + static_cast(var_type_map[var_name]->precision())); } } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); } - for (const std::string &name : 
op->op_info()->output_names()) { + for (const auto &var_name : op->op_info()->output_names()) { node_storage_.emplace_back(); auto *arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; + if (var_type_map.count(var_name) && !arg_node->arg()->type) { + arg_node->arg()->type = var_type_map[var_name]; } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(op_node, arg_node); } diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index e2967cf96a6b00ccc225ce05b043cb94f161b1d6..819b0a71ea1be04c85316e90001aef311b7d7238 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -35,9 +35,13 @@ class GraphBase {}; class SSAGraph : GraphBase { public: - // @param program: the op program + // @param program: the target program with vars and ops // @param valid_places: the valid places user set for the system. - void Build(const Program &program, const std::vector &valid_places); + // @param block_idx: the block index in the target program, default is 0(main + // block) + void Build(const Program &program, + const std::vector &valid_places, + int block_idx = kRootBlockIdx); void RemoveNode(const mir::Node *node); std::vector StmtTopologicalOrder(); diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 1de0d1a26577b31e1dfc5187562cc80bce6fe4d1..b5dd1f8b9c119f4647b72a35eb71df37f31fc6f8 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -110,15 +110,16 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (out_type_int8) { auto out_node = node.outlinks.front(); CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; auto one_adj_op_node = out_node->outlinks.front(); CHECK(one_adj_op_node->IsStmt()); auto& one_adj_instruct = one_adj_op_node->AsStmt(); CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); auto update_desc = *instruct.mutable_op_info(); instruct.ResetOp(update_desc, graph->valid_places()); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 31a38280ff537d486f5fb3ba46dee5b025d3f1f1..13805b2b18634551d4b74ac436954fa8f6b9ed05 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -411,34 +411,60 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops and Vars of the target - // subgraph and sub_block_idx is set as a attribute of subgraph op, - // sub_block_idx < 0 means it's a new subgraph op - int sub_block_idx = -(subgraph_idx + 1); - auto sub_block_desc = new cpp::BlockDesc(); + // Create a program desc 
and a block desc for storing all of Ops and Vars of + // the target subgraph and sub_block_idx is set as a attribute of subgraph op, + // sub_block_idx = 0 means it's a new subgraph op + auto sub_program_desc = std::make_shared(); + int sub_block_idx = 0; + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); for (auto &op_node : subgraph_nodes) { - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_node->AsStmt().op_info(); + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_node->AsStmt().op_info(); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); // Extract input and output nodes from the target subgraph - std::set input_var_nodes; + std::set idata_var_nodes; std::set weight_var_nodes; - std::set output_var_nodes; + std::set odata_var_nodes; std::set local_var_nodes; std::set unused_var_nodes; ExtractInputsOutputs(subgraph_nodes, - &input_var_nodes, + &idata_var_nodes, &weight_var_nodes, - &output_var_nodes, + &odata_var_nodes, &local_var_nodes, &unused_var_nodes); - + // A simplified model without the original weight/local/unused nodes on the + // subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to + // true(default) and Predictor->Run(...), Predictor->Save(...) is called. + std::set input_var_nodes(idata_var_nodes.begin(), + idata_var_nodes.end()); + std::set output_var_nodes(odata_var_nodes.begin(), + odata_var_nodes.end()); + if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) { + input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end()); + output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end()); + output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end()); + } // Set input and output name mapping which stores the real inputs and // outputs + std::vector idata_var_names; + std::vector odata_var_names; + for (auto &var_node : idata_var_nodes) { + idata_var_names.push_back(var_node->AsArg().name); + } + for (auto &var_node : odata_var_nodes) { + odata_var_names.push_back(var_node->AsArg().name); + } + subgraph_op_desc.SetAttr>("input_data_names", + idata_var_names); + subgraph_op_desc.SetAttr>("output_data_names", + odata_var_names); + // Set all of the inputs and outputs to the target subgraph op + // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() std::vector input_var_names; std::vector output_var_names; for (auto &var_node : input_var_nodes) { @@ -447,60 +473,36 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : output_var_nodes) { output_var_names.push_back(var_node->AsArg().name); } - subgraph_op_desc.SetAttr>("input_data_names", - input_var_names); - subgraph_op_desc.SetAttr>("output_data_names", - output_var_names); + subgraph_op_desc.SetInput("Inputs", input_var_names); + subgraph_op_desc.SetOutput("Outputs", output_var_names); + auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + static_cast(subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); + subgraph_op->Attach(subgraph_op_desc, any_op->scope()); - // Set input/output scale values of input/output var nodes for - // type_precision_cast_pass. - std::vector input_data_scales; - std::vector output_data_scales; + // Export the scale values of the input/output var nodes of the inner op nodes + // only for type_precision_cast_pass. 
for (auto &var_node : input_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->outlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("input_scale")) { - input_data_scales.push_back( - any_inst.op_info()->GetAttr("input_scale")); + if (any_inst.op_info()->HasInputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetInputScale( + var_node_name, any_inst.op_info()->GetInputScale(var_node_name)); } } for (auto &var_node : output_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->inlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("output_scale")) { - output_data_scales.push_back( - any_inst.op_info()->GetAttr("output_scale")); + if (any_inst.op_info()->HasOutputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetOutputScale( + var_node_name, any_inst.op_info()->GetOutputScale(var_node_name)); } } - if (input_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("input_data_scales", - input_data_scales); - } - if (output_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("output_data_scales", - output_data_scales); - } - - // Set all of the inputs and outputs to the target subgraph op - // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() - for (auto &var_node : weight_var_nodes) { - input_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : local_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : unused_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - subgraph_op_desc.SetInput("Inputs", input_var_names); - subgraph_op_desc.SetOutput("Outputs", output_var_names); - auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - static_cast(subgraph_op.get()) - ->SetSubBlock(sub_block_desc); - auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); - subgraph_op->Attach(subgraph_op_desc, any_op->scope()); // Create and add a new subgraph node into the graph auto subgraph_op_node = @@ -508,26 +510,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : input_var_nodes) { IR_NODE_LINK_TO(var_node, subgraph_op_node); } - for (auto &var_node : weight_var_nodes) { - IR_NODE_LINK_TO(var_node, subgraph_op_node); - } for (auto &var_node : output_var_nodes) { IR_OP_VAR_LINK(subgraph_op_node, var_node); } - for (auto &var_node : local_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } - for (auto &var_node : unused_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } // Remove subgraph nodes and unused var nodes - auto nodes2rm = GetNodes2RM(subgraph_nodes, - {input_var_nodes, - weight_var_nodes, - output_var_nodes, - local_var_nodes, - unused_var_nodes}); + auto nodes2rm = + GetNodes2RM(subgraph_nodes, {input_var_nodes, output_var_nodes}); GraphSafeRemoveNodes(graph, nodes2rm); } @@ -602,7 +591,17 @@ std::set GetNodes2RM( std::set nodes2rm(op_nodes.begin(), op_nodes.end()); for (auto &op_node : op_nodes) { for (auto &var_node : op_node->inlinks) { - if (!nodes2rm.count(var_node)) { + bool skip = false; + // skip the var node which is used by any other ops that doesn't belong to + // the subgraph ops. 
+ for (auto &out_op_node : var_node->outlinks) { + if (std::find(op_nodes.begin(), op_nodes.end(), out_op_node) != + op_nodes.end()) { + skip = true; + break; + } + } + if (!skip && !nodes2rm.count(var_node)) { nodes2rm.insert(var_node); } } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9..f7e354f7a22582991ca64fa2d5fcc147bf6ed427 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -20,7 +20,7 @@ #include "lite/api/paddle_use_passes.h" #include "lite/core/mir/ssa_graph.h" #include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" DEFINE_string(model_dir, "", "model_dir"); @@ -141,12 +141,11 @@ std::vector AddFetchDesc( } TEST(Subgraph, detect_simple_model) { - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); std::vector valid_places{{TARGET(kHost), PRECISION(kFloat)}}; auto scope = std::make_shared(); // Build a simple network - program_desc.ClearBlocks(); - auto* block_desc = program_desc.AddBlock(); + auto* block_desc = program_desc->AddBlock(); block_desc->ClearOps(); block_desc->ClearVars(); auto* var_desc = block_desc->AddVar(); @@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) { "the path of model files."; return; } - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); auto scope = std::make_shared(); LoadModelPb(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, scope.get(), - &program_desc, + program_desc.get(), !FLAGS_model_file.empty() && !FLAGS_params_file.empty(), false); std::vector valid_places({ @@ -200,6 +199,9 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}, +#endif #ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index f4df5c5f454c08c5f79dd220e579632dc7cf05a5..429c780912094baf9ceb8b5124dc197abd51af41 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void HuaweiAscendNPUSubgraphPass::Apply( + const std::unique_ptr& graph) { + std::set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void APUSubgraphPass::Apply(const std::unique_ptr& graph) { std::set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) \ @@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass, + paddle::lite::mir::HuaweiAscendNPUSubgraphPass) + .BindTargets({TARGET(kHuaweiAscendNPU)}); REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) .BindTargets({TARGET(kAPU)}); 
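Note: the HuaweiAscendNPUSubgraphPass added above follows the same pattern as the other device subgraph passes in this file: the supported-op set is produced by re-including the bridge registration header with USE_SUBGRAPH_BRIDGE redefined, and a teller lambda decides which statement nodes may be fused. The following is a minimal standalone sketch of that pattern only; the op names are placeholders and the real pass hands the teller to SubgraphFuser rather than printing.

// Standalone illustration of the "supported-op teller" idiom used by the
// device-specific subgraph passes. The bridge list is normally pulled in via
// `#include ".../paddle_use_bridges.h"`; here it is inlined with example ops.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> supported_lists;
  // Each USE_SUBGRAPH_BRIDGE(op_type, target) line expands to one insertion.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
  USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU)  // illustrative entries only
  USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU)
#undef USE_SUBGRAPH_BRIDGE

  // The teller answers "can this op be placed inside a subgraph op?".
  auto teller = [&](const std::string& op_type) {
    return supported_lists.count(op_type) != 0;
  };

  for (const std::string& op_type : {"conv2d", "relu", "top_k"}) {
    std::cout << op_type << (teller(op_type) ? " -> fuse" : " -> keep") << "\n";
  }
  return 0;
}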
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 8c2b501a62356c91e93f3c4ca91f70879d3c9229..c40a527cfe72ab1556e868d05aab5c0280fa4514 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class HuaweiAscendNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class APUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 8fd3751f9ca1585af6b8b00f23acd6bacf5b7a51..5a57623b0c984be24e2d0b97ee575b22d369fdad 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -13,8 +13,12 @@ // limitations under the License. #include + #include + #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -183,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + valid_places.push_back( + lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}); +#endif #ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 1133e5ba8203ec9fea177844a6311c993f6b8ff7..44b6eaf1eb0c5c96630dd66d129919b40f3ea8c6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply( REGISTER_MIR_PASS(type_layout_cast_pass, paddle::lite::mir::TypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, paddle::lite::mir::OpenCLTypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25648877568f6427843f8ded6890450c265b4f06..40ece35993cfd2f8bce07e605387741202973614 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op, op_desc->GetAttr>("input_data_names"); std::replace(input_data_names.begin(), input_data_names.end(), from, to); op_desc->SetAttr("input_data_names", input_data_names); - auto* subblock_desc = static_cast(op)->GetSubBlock(); - CHECK(subblock_desc); - for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { - auto* subblock_op_desc = subblock_desc->GetOp(i); - for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { - for (auto& subblock_var_name : subblock_op_input.second) { - if (subblock_var_name == from) { - subblock_var_name = to; + auto sub_program_desc = + static_cast(op)->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = op_desc->GetAttr("sub_block"); + auto sub_block_desc = + 
sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) { + for (auto& sub_var_name : sub_op_input.second) { + if (sub_var_name == from) { + sub_var_name = to; } } } @@ -66,65 +72,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { } } -// Infer the scale value for the new calib op from the subgraph op -static bool InferScaleFromSubgraph(std::string var_name, - const OpInfo* op_info, - float* scale, - bool reverse = false) { - std::string attr_name = reverse ? "output_data_names" : "input_data_names"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_names = - op_info->GetAttr>(attr_name); - attr_name = reverse ? "output_data_scales" : "input_data_scales"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_scales = op_info->GetAttr>(attr_name); - auto size = input_or_output_names.size(); - CHECK(size == input_or_output_scales.size()); - for (size_t i = 0; i < size; i++) { - if (input_or_output_names[i] == var_name) { - *scale = input_or_output_scales[i]; - return true; - } - } - return false; -} - // Infer the scale value for the new calib op from the input_scale of the // current op and output_scale of the previous op. // case 1: prev_op->var_node->op_node(int8->any op, with input_scale). -// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with -// input_data_scales). -// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, // without input_scale). -// case 4: prev_op(any->int8, subgraph_op, with -// output_data_scales)->var_node->op_node(fp32->any, without input_scale). 
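Note: the InferScale rewrite that follows no longer special-cases subgraph ops, because the flat "input_scale"/"output_scale" and "input_data_scales"/"output_data_scales" attributes are replaced by per-variable scales queried through HasInputScale/GetInputScale and HasOutputScale/GetOutputScale. As the op_lite.cc hunk later in this diff shows, those helpers store each scale vector under an attribute named "<argname><index>_scale". Below is a minimal standalone sketch of that key scheme; the plain std::map stands in for the real cpp::OpDesc attribute storage and is an assumption for illustration only.

// Sketch of the per-tensor scale attribute naming used by
// OpInfo::SetInputScale/GetInputScale: "<argname><index>_scale".
#include <cassert>
#include <map>
#include <string>
#include <vector>

using AttrMap = std::map<std::string, std::vector<float>>;

static std::string ScaleAttrName(const std::string& argname, int index) {
  return argname + std::to_string(index) + "_scale";
}

int main() {
  AttrMap attrs;
  // e.g. the variable bound to input argument "X" at index 0 of a quantized op.
  attrs[ScaleAttrName("X", 0)] = {0.017f};
  // conv2d/depthwise_conv2d/mul weights may carry one scale per output channel.
  attrs[ScaleAttrName("Filter", 0)] = {0.01f, 0.02f, 0.03f};

  assert(attrs.count(ScaleAttrName("X", 0)) == 1);   // HasInputScale analogue
  auto x_scale = attrs.at(ScaleAttrName("X", 0));    // GetInputScale analogue
  assert(x_scale.size() == 1 && x_scale[0] == 0.017f);
  return 0;
}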
static bool InferScale(Node* var_node, Node* op_node, float* scale) { bool found = false; auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); auto var_name = var_node->AsArg().name; - if (op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, op_info, scale, false); + if (op_info->HasInputScale(var_name)) { + *scale = op_info->GetInputScale(var_name)[0]; + found = true; } else { - if (op_info->HasAttr("input_scale")) { - *scale = op_info->GetAttr("input_scale"); + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_info->HasOutputScale(var_name)) { + *scale = prev_op_info->GetOutputScale(var_name)[0]; found = true; - } else { - // Obtain the output_scale from one of its previous Ops - auto prev_op_node = var_node->inlinks.front(); - CHECK(prev_op_node->IsStmt()); - auto& prev_inst = prev_op_node->AsStmt(); - auto prev_op_info = prev_inst.op_info(); - auto prev_op_type = prev_op_info->Type(); - if (prev_op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); - } else { - if (prev_op_info->HasAttr("output_scale")) { - *scale = prev_op_info->GetAttr("output_scale"); - found = true; - } - } } } return found; diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index d9f420cfad90d3c6a1f08072d8c5f87d2326661a..f7d35bfef3ac53903448c48300c144f8fd15652d 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -59,25 +59,46 @@ class VariablePlaceInferencePass : public DebugPass { } // Set the type of the weight - void SetWeightType(Node* w, + void SetWeightType(Node* weight_node, const LiteType& type, - const std::map& lite_with_targets) { + const std::map& with_targets) { VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); - if (lite_with_targets.at("kFPGA")) { - w->AsArg().type = LiteType::GetTensorTy( + if (with_targets.at("kFPGA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kOpenCL")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kOpenCL")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kCUDA")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kCUDA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); } else { - w->AsArg().type = LiteType::GetTensorTy( + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); } } + // Update a's kUnk fields from b's fields. 
+ void UpdateTypeFrom(const Type** a, const Type* b) { + auto target = (*a)->target(); + auto precision = (*a)->precision(); + auto layout = (*a)->layout(); + if (target == TARGET(kUnk)) { + target = b->target(); + } + if (precision == PRECISION(kUnk)) { + precision = b->precision(); + } + if (layout == DATALAYOUT(kUnk)) { + layout = b->layout(); + } + if ((*a)->IsTensor() && b->IsTensor()) { + *a = LiteType::GetTensorTy(target, precision, layout); + } else if ((*a)->IsTensorList() && b->IsTensorList()) { + *a = LiteType::GetTensorListTy(target, precision, layout); + } + } + void InferenceArgumentPlace(SSAGraph* graph) { auto& valid_places = graph->valid_places(); auto valid_places_has_target = [&](TargetType t) -> bool { @@ -88,122 +109,90 @@ class VariablePlaceInferencePass : public DebugPass { } return false; }; - std::map lite_with_targets{ + std::map with_targets{ {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, {"kCUDA", valid_places_has_target(TARGET(kCUDA))}, {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; - VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; - VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"]; + VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"]; VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); - for (auto& x : graph->StmtTopologicalOrder()) { - auto& inst = x->AsStmt(); + for (auto& node : graph->StmtTopologicalOrder()) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + const auto& op_type = op_info->Type(); + auto& kernel = inst.picked_kernel(); + // The IoCopyOp is a tool operator, it won't support the type inference. // in fpga, we has io_copy+cali+layout tool ops, so we need type inference - // for - // tool operator - if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; + // for tool operator + if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) { + VLOG(3) << "skip 'io_copy' if target is FPGA and OpenCL"; + if (op_type == "io_copy") continue; } - // deal with inputs - VLOG(4) << "Infering op " << inst.op_info()->Repr(); - // TODO(zhaolong): Add check if the node's name in op's arguments. 
- auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; - - for (auto* x_in : x->inlinks) { - std::string node_name = x_in->AsArg().name; - std::string arg_name = get_argname(node_name, inst.op_info()->inputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name; - VLOG(4) << "-- input arg_name:" << arg_name << " " - << "-- node name:" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); - if (!x_in->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; - if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type, lite_with_targets); + // Infering the input and output variable's place according to the + // declaration of I/O arguments of the picked kernel of the op + VLOG(4) << "Op " << op_info->Repr(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument for var " << var_name; + VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name; + const auto* decl_type = kernel.GetInputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(in_node, *decl_type, with_targets); } else { - x_in->AsArg().type = type; + *var_type = decl_type; } - } else if (x_in->AsArg().type->target() == TARGET(kUnk) && - x_in->AsArg().type->precision() != PRECISION(kUnk) && - x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. 
- if (type->precision() == PRECISION(kInt8)) { - x_in->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8)) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_in->AsArg().type->precision(); - x_in->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } - - VLOG(4) << "inst " << inst.op_info()->Repr(); - for (auto* x_out : x->outlinks) { - std::string node_name = x_out->AsArg().name; - std::string arg_name = - get_argname(node_name, inst.op_info()->outputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name << " in Inst " - << inst.op_type(); - VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); - if (!x_out->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; - if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type, lite_with_targets); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument for var " << var_name; + VLOG(4) << " - output arg name:" << arg_name + << " var name:" << var_name; + const auto* decl_type = kernel.GetOutputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(out_node, *decl_type, with_targets); } else { - x_out->AsArg().type = type; + *var_type = decl_type; } - } else if (x_out->AsArg().type->target() == TARGET(kUnk) && - x_out->AsArg().type->precision() != PRECISION(kUnk) && - x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. - if (type->precision() == PRECISION(kInt8)) { - x_out->AsArg().type = type; - } else if (type->precision() == PRECISION(kFP16) && - type->target() != TARGET(kOpenCL)) { - x_out->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8) || + (decl_type->precision() == PRECISION(kFP16) && + decl_type->target() != TARGET(kOpenCL))) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_out->AsArg().type->precision(); - x_out->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } } } - // Update me's kUnk fields by other's fields. - void UpdatePlace(Place* me, const Place& other) { - CHECK(other.is_valid()); - if (me->target == TARGET(kUnk)) { - me->target = other.target; - } - if (me->precision == PRECISION(kUnk)) { - me->precision = other.precision; - } - if (me->layout == DATALAYOUT(kUnk)) { - me->layout = other.layout; - } - } - private: - // The default target for arguments, e.g. load weights to CPU memory for CUDA - // computation by default. + // The default target for arguments, e.g. load weights to CPU memory for + // CUDA computation by default. 
TargetType argument_default_target_{TARGET(kHost)}; }; diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 537636065d6aeea67fd7c8c71fb00b183720fecc..585aaf3b703bca0a0a34030106dbf793e2a31d52 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -18,6 +18,7 @@ #include #include #include "lite/core/op_registry.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -186,5 +187,114 @@ void OpLite::AttachOutput(const cpp::OpDesc &op_desc, } } +bool OpInfo::GetInputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetOutputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetInputIndex(const std::string &input_name, int *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), input_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), output_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::HasInputScale(const std::string &input_name) const { + std::string argname; + int index; + if (GetInputArgname(input_name, &argname) && + GetInputIndex(input_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +bool OpInfo::HasOutputScale(const std::string &output_name) const { + std::string argname; + int index; + if (GetOutputArgname(output_name, &argname) && + GetOutputIndex(output_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +void OpInfo::SetInputScale(const std::string &input_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetInputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +void OpInfo::SetOutputScale(const std::string &output_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetOutputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +std::vector OpInfo::GetInputScale(const std::string &input_name) const { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + +std::vector OpInfo::GetOutputScale( + const std::string &output_name) const { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + return GetAttr>(argname + to_string(index) + 
"_scale"); +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 301065d5b6bb5c4f41b19d9a9034985ca2f74d89..d94753220a1b5d963092c62c43d7e49b03243c63 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -24,7 +24,7 @@ #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/scope.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/op_params.h" namespace paddle { @@ -99,7 +99,7 @@ class OpLite : public Registry { std::vector> CreateKernels( const std::vector &places, const std::string &kernel_type = ""); - lite::Scope *scope() { return scope_; } + Scope *scope() { return scope_; } // Assign op param to kernel. virtual void AttachKernel(KernelBase *kernel) = 0; @@ -169,7 +169,7 @@ class OpLite : public Registry { } protected: - lite::Scope *scope_{nullptr}; + Scope *scope_{nullptr}; std::unique_ptr kernel_; std::string op_type_; std::vector valid_places_; @@ -229,55 +229,8 @@ class OpInfo : public cpp::OpDesc { return OutputArgumentNames(); } - bool GetInputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - bool GetOutputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - - // For the input variable name, find the index of the corresponding - // input argname - bool GetInputIndex(const std::string &value_name, int *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - - // For the output variable name, find the index of the corresponding - // output argname - bool GetOutputIndex(const std::string &value_name, int *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - void UpdateAllInputs(const std::string &from, const std::string &to) { - for (auto &item : inputs_) { + for (auto &item : *mutable_inputs()) { for (auto &var : item.second) { if (var == from) var = to; } @@ -285,12 +238,32 @@ class OpInfo : public cpp::OpDesc { } void UpdateAllOutputs(const std::string &from, const std::string &to) { - for (auto &item : outputs_) { + for (auto &item : *mutable_outputs()) { for (auto &var : item.second) { if (var == from) var = to; } } } + + bool GetInputArgname(const std::string &value_name, std::string *out) const; + bool GetOutputArgname(const std::string &value_name, std::string *out) const; + + bool GetInputIndex(const std::string &input_name, int *out) const; + bool GetOutputIndex(const std::string &output_name, int *out) const; + + bool HasInputScale(const std::string &input_name) const; + bool HasOutputScale(const std::string &output_name) const; + + void SetInputScale(const std::string &input_name, + const std::vector &scale_value); + void SetOutputScale(const std::string &output_name, + const std::vector &scale_value); + + // For conv2d, depthwise_conv2d and mul, 
the scale of weight are a vector. + // Otherwise, all input and output scales are scalar, but we save these + // as vecotr. + std::vector GetInputScale(const std::string &input_name) const; + std::vector GetOutputScale(const std::string &output_name) const; }; } // namespace lite diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index ef6d3cfaf001ea55cef23faee11d508920c49715..cb773edd18ee236a30cbfcf5d6b1ce5773f0269d 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -17,277 +17,5 @@ #include namespace paddle { -namespace lite { - -const std::map &GetOp2PathDict() { - return OpKernelInfoCollector::Global().GetOp2PathDict(); -} - -std::list> KernelRegistry::Create( - const std::string &op_type, - TargetType target, - PrecisionType precision, - DataLayoutType layout) { - Place place{target, precision, layout}; - VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString(); -#define CREATE_KERNEL1(target__, precision__) \ - switch (layout) { \ - case DATALAYOUT(kNCHW): \ - return Create(op_type); \ - case DATALAYOUT(kAny): \ - return Create(op_type); \ - case DATALAYOUT(kNHWC): \ - return Create(op_type); \ - case DATALAYOUT(kImageDefault): \ - return Create(op_type); \ - case DATALAYOUT(kImageFolder): \ - return Create(op_type); \ - case DATALAYOUT(kImageNW): \ - return Create(op_type); \ - default: \ - LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ - } - -#define CREATE_KERNEL(target__) \ - switch (precision) { \ - case PRECISION(kFloat): \ - CREATE_KERNEL1(target__, kFloat); \ - case PRECISION(kInt8): \ - CREATE_KERNEL1(target__, kInt8); \ - case PRECISION(kFP16): \ - CREATE_KERNEL1(target__, kFP16); \ - case PRECISION(kAny): \ - CREATE_KERNEL1(target__, kAny); \ - case PRECISION(kInt32): \ - CREATE_KERNEL1(target__, kInt32); \ - case PRECISION(kInt64): \ - CREATE_KERNEL1(target__, kInt64); \ - default: \ - CHECK(false) << "not supported kernel precision " \ - << PrecisionToStr(precision); \ - } - - switch (target) { - case TARGET(kHost): { - CREATE_KERNEL(kHost); - } break; -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - case TARGET(kX86): { - CREATE_KERNEL(kX86); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - case TARGET(kCUDA): { - CREATE_KERNEL(kCUDA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - case TARGET(kARM): { - CREATE_KERNEL(kARM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - case TARGET(kOpenCL): { - CREATE_KERNEL(kOpenCL); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - case TARGET(kNPU): { - CREATE_KERNEL(kNPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - case TARGET(kAPU): { - CREATE_KERNEL(kAPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_XPU) - case TARGET(kXPU): { - CREATE_KERNEL(kXPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - case TARGET(kFPGA): { - CREATE_KERNEL(kFPGA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - case TARGET(kBM): { - CREATE_KERNEL(kBM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - case TARGET(kMLU): { - CREATE_KERNEL(kMLU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - case TARGET(kRKNPU): { - CREATE_KERNEL(kRKNPU); - } break; -#endif - default: - CHECK(false) << "not supported kernel 
target " << TargetToStr(target); - } - -#undef CREATE_KERNEL - return std::list>(); -} - -KernelRegistry::KernelRegistry() : registries_() { -#define INIT_FOR(target__, precision__, layout__) \ - registries_[std::make_tuple(TARGET(target__), \ - PRECISION(precision__), \ - DATALAYOUT(layout__))] \ - .set *>( \ - &KernelRegistryForTarget::Global()); -// Currently, just register 2 kernel targets. -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kFloat, kNHWC); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kFP16, kNCHW); - INIT_FOR(kCUDA, kFP16, kNHWC); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - INIT_FOR(kCUDA, kInt64, kNCHW); - INIT_FOR(kCUDA, kInt64, kNHWC); - INIT_FOR(kCUDA, kInt32, kNCHW); - INIT_FOR(kCUDA, kInt32, kNHWC); -#endif - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - INIT_FOR(kMLU, kFloat, kNHWC); - INIT_FOR(kMLU, kFloat, kNCHW); - INIT_FOR(kMLU, kFP16, kNHWC); - INIT_FOR(kMLU, kFP16, kNCHW); - INIT_FOR(kMLU, kInt8, kNHWC); - INIT_FOR(kMLU, kInt8, kNCHW); - INIT_FOR(kMLU, kInt16, kNHWC); - INIT_FOR(kMLU, kInt16, kNCHW); -#endif - - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kBool, kNCHW); - INIT_FOR(kHost, kBool, kNHWC); - INIT_FOR(kHost, kBool, kAny); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kFP16, kNCHW); - INIT_FOR(kHost, kFP16, kNHWC); - INIT_FOR(kHost, kFP16, kAny); - INIT_FOR(kHost, kInt8, kNCHW); - INIT_FOR(kHost, kInt8, kNHWC); - INIT_FOR(kHost, kInt8, kAny); - INIT_FOR(kHost, kInt16, kNCHW); - INIT_FOR(kHost, kInt16, kNHWC); - INIT_FOR(kHost, kInt16, kAny); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt32, kNHWC); - INIT_FOR(kHost, kInt32, kAny); - INIT_FOR(kHost, kInt64, kNCHW); - INIT_FOR(kHost, kInt64, kNHWC); - INIT_FOR(kHost, kInt64, kAny); - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - INIT_FOR(kX86, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kFloat, kNHWC); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kInt8, kNHWC); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - INIT_FOR(kARM, kInt32, kNCHW); - INIT_FOR(kARM, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kFloat, kNHWC); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kNHWC); - INIT_FOR(kOpenCL, kFloat, kAny); - INIT_FOR(kOpenCL, kInt8, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - INIT_FOR(kOpenCL, kFP16, kNCHW); - INIT_FOR(kOpenCL, kFP16, kNHWC); - INIT_FOR(kOpenCL, kFP16, kImageDefault); - INIT_FOR(kOpenCL, kFP16, kImageFolder); - INIT_FOR(kOpenCL, kFP16, kImageNW); - INIT_FOR(kOpenCL, kFloat, kImageDefault); - INIT_FOR(kOpenCL, kFloat, kImageFolder); - INIT_FOR(kOpenCL, kFloat, kImageNW); - INIT_FOR(kOpenCL, kAny, kImageDefault); - INIT_FOR(kOpenCL, kAny, kImageFolder); - INIT_FOR(kOpenCL, kAny, kImageNW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kFloat, kNHWC); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kInt8, kNHWC); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kNHWC); - INIT_FOR(kNPU, kAny, 
kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - INIT_FOR(kAPU, kInt8, kNCHW); - INIT_FOR(kXPU, kFloat, kNCHW); - INIT_FOR(kXPU, kInt8, kNCHW); - INIT_FOR(kXPU, kAny, kNCHW); - INIT_FOR(kXPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - INIT_FOR(kBM, kFloat, kNCHW); - INIT_FOR(kBM, kInt8, kNCHW); - INIT_FOR(kBM, kAny, kNCHW); - INIT_FOR(kBM, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - INIT_FOR(kRKNPU, kFloat, kNCHW); - INIT_FOR(kRKNPU, kInt8, kNCHW); - INIT_FOR(kRKNPU, kAny, kNCHW); - INIT_FOR(kRKNPU, kAny, kAny); -#endif - -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 2128e218554fb304474c14cfacd7867e491a4fe6..90a2b563af7e17a4806bd47cb883d9590cdab40f 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -33,19 +32,19 @@ using LiteType = paddle::lite::Type; class OpKernelInfoCollector { public: - static OpKernelInfoCollector &Global() { - static auto *x = new OpKernelInfoCollector; + static OpKernelInfoCollector& Global() { + static auto* x = new OpKernelInfoCollector; return *x; } - void AddOp2path(const std::string &op_name, const std::string &op_path) { + void AddOp2path(const std::string& op_name, const std::string& op_path) { size_t index = op_path.find_last_of('/'); if (index != std::string::npos) { op2path_.insert(std::pair( op_name, op_path.substr(index + 1))); } } - void AddKernel2path(const std::string &kernel_name, - const std::string &kernel_path) { + void AddKernel2path(const std::string& kernel_name, + const std::string& kernel_path) { size_t index = kernel_path.find_last_of('/'); if (index != std::string::npos) { kernel2path_.insert(std::pair( @@ -53,13 +52,13 @@ class OpKernelInfoCollector { } } void SetKernel2path( - const std::map &kernel2path_map) { + const std::map& kernel2path_map) { kernel2path_ = kernel2path_map; } - const std::map &GetOp2PathDict() { + const std::map& GetOp2PathDict() { return op2path_; } - const std::map &GetKernel2PathDict() { + const std::map& GetKernel2PathDict() { return kernel2path_; } @@ -71,409 +70,185 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { -const std::map &GetOp2PathDict(); - -using KernelFunc = std::function; -using KernelFuncCreator = std::function()>; -class LiteOpRegistry final : public Factory> { +class OpLiteFactory { public: - static LiteOpRegistry &Global() { - static auto *x = new LiteOpRegistry; - return *x; + // Register a function to create an op + void RegisterCreator(const std::string& op_type, + std::function()> fun) { + op_registry_[op_type] = fun; } - private: - LiteOpRegistry() = default; -}; - -template -class OpLiteRegistor : public Registor { - public: - explicit OpLiteRegistor(const std::string &op_type) - : Registor([&] { - LiteOpRegistry::Global().Register( - op_type, [op_type]() -> std::unique_ptr { - return std::unique_ptr(new OpClass(op_type)); - }); - }) {} -}; -template -using KernelRegistryForTarget = - Factory, 
std::unique_ptr>; - -class KernelRegistry final { - public: - using any_kernel_registor_t = - variant *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + static OpLiteFactory& Global() { + static OpLiteFactory* x = new OpLiteFactory; + return *x; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::shared_ptr Create(const std::string& op_type) const { + auto it = op_registry_.find(op_type); + if (it == op_registry_.end()) return nullptr; + return it->second(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::string DebugString() const { + STL::stringstream ss; + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; + } + return ss.str(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::vector GetAllOps() const { + std::vector res; + for (const auto& op : op_registry_) { + res.push_back(op.first); + } + return res; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + protected: + std::map()>> op_registry_; +}; - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget * // - >; +using LiteOpRegistry = OpLiteFactory; - KernelRegistry(); +// Register OpLite by initializing a static OpLiteRegistrar instance +class OpLiteRegistrar { + public: + OpLiteRegistrar(const std::string& op_type, + std::function()> fun) { + OpLiteFactory::Global().RegisterCreator(op_type, fun); + } + // Touch function is used to guarantee registrar was initialized. 
+ void touch() {} +}; - static KernelRegistry &Global(); +class KernelFactory { + public: + // Register a function to create kernels + void RegisterCreator(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back( + fun); + } - template - void Register( - const std::string &name, - typename KernelRegistryForTarget::creator_t - &&creator) { - using kernel_registor_t = - KernelRegistryForTarget; - auto &varient = registries_[std::make_tuple(Target, Precision, Layout)]; - auto *reg = varient.template get(); - CHECK(reg) << "Can not be empty of " << name; - reg->Register(name, std::move(creator)); -#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL - kernel_info_map_[name].push_back( - std::make_tuple(Target, Precision, Layout)); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL + static KernelFactory& Global() { + static KernelFactory* x = new KernelFactory; + return *x; } - template - std::list> Create(const std::string &op_type) { - using kernel_registor_t = - KernelRegistryForTarget; - std::list> kernel_list; - std::tuple temp_tuple( - Target, Precision, Layout); - if (registries_[temp_tuple].valid()) { - kernel_list = - registries_[temp_tuple].template get()->Creates( - op_type); + /** + * Create all kernels belongs to an op. + */ + std::list> Create(const std::string& op_type) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) { + for (auto& fun : it->second) { + res.emplace_back(fun()); + } } - return kernel_list; + return res; } - std::list> Create(const std::string &op_type, + /** + * Create a specific kernel. Return a list for API compatible. + */ + std::list> Create(const std::string& op_type, TargetType target, PrecisionType precision, - DataLayoutType layout); + DataLayoutType layout) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + auto it = kernel_registry.find(std::make_tuple(target, precision, layout)); + if (it == kernel_registry.end()) return res; + for (auto& fun : it->second) { + res.emplace_back(fun()); + } + return res; + } std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - count += item.second.size(); - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; } - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL } - private: - mutable std::map, - any_kernel_registor_t> - registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif + protected: + // Outer map: op -> a map of kernel. + // Inner map: kernel -> creator function. 
+ // Each kernel was represented by a combination of + std::map, + std::list()>>>> + op_registry_; }; -template -class KernelRegistor : public lite::Registor { +using KernelRegistry = KernelFactory; + +// Register Kernel by initializing a static KernelRegistrar instance +class KernelRegistrar { public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} + KernelRegistrar(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + KernelFactory::Global().RegisterCreator( + op_type, target, precision, layout, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} }; } // namespace lite } // namespace paddle -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ +// Register an op. +#define REGISTER_LITE_OP(op_type__, OpClass) \ + static paddle::lite::OpLiteRegistrar op_type__##__registry( \ + #op_type__, []() { \ + return std::unique_ptr(new OpClass(#op_type__)); \ + }); \ + int touch_op_##op_type__() { \ + op_type__##__registry.touch(); \ + OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ + return 0; \ } -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__##layout__##registor__instance__##alias__ // NOLINT - -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - +// Register a kernel. 
#define REGISTER_LITE_KERNEL( \ op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ + static paddle::lite::KernelRegistrar \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry( \ + #op_type__, \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__), \ + []() { \ + std::unique_ptr x(new KernelClass); \ + x->set_op_type(#op_type__); \ + x->set_alias(#alias__); \ + return x; \ + }); \ int touch_##op_type__##target__##precision__##layout__##alias__() { \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry \ + .touch(); \ OpKernelInfoCollector::Global().AddKernel2path( \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ return 0; \ } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) UNUSED = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register + static auto \ + op_type__##target__##precision__##layout__##alias__##param_register \ + UNUSED = paddle::lite::ParamTypeRegistry::NewInstance< \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__)>(#op_type__ "/" #alias__) diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 05f801facdf9557da1e872d69fcde0bf3b321d2e..42dac8e59bda84ce5dc2cb04f2f3712d1386b96c 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -19,6 +19,7 @@ #include #include #include +#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h" #include "lite/core/mir/generate_program_pass.h" #include "lite/core/mir/pass_manager.h" #include "lite/core/mir/pass_utils.h" @@ -36,6 +37,9 @@ namespace lite { * lite::Optimizer optimize a program. It utilize the mir passes to analysis the * program and export an optimized program. 
*/ +// TODO(hong1986032) Support the following passes for the subblocks +const std::set kSubblockUnsupportedPasses( + {"memory_optimize_pass"}); class Optimizer { public: Optimizer() {} @@ -60,14 +64,20 @@ class Optimizer { program_ = &program; valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; - CHECK(!graph_) << "duplicate optimize found"; - - graph_.reset(new mir::SSAGraph); - graph_->Build(program, valid_places); - graph_->SetValidPlaces(valid_places); + CHECK(graphs_.empty()) << "duplicate optimize found"; + + auto block_size = program.block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + std::unique_ptr graph; + graph.reset(new mir::SSAGraph); + graph->Build(program, valid_places, block_idx); + graph->SetValidPlaces(valid_places); + graphs_.emplace_back(std::move(graph)); + } SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); if (passes.empty() || passes.size() == 1) { std::vector passes_local{ @@ -76,6 +86,7 @@ class Optimizer { "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise + "lite_conv_conv_fuse_pass", // // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -94,6 +105,8 @@ class Optimizer { #endif "identity_dropout_eliminate_pass", "__xpu__resnet_fuse_pass", + "__xpu__resnet_cbam_fuse_pass", + "__xpu__mmdnn_fuse_pass", "__xpu__multi_encoder_fuse_pass", "__xpu__embedding_with_eltwise_add_fuse_pass", "__xpu__fc_fuse_pass", @@ -104,12 +117,19 @@ class Optimizer { // 'enable_int8' for all // of the quantized ops. "npu_subgraph_pass", + "huawei_ascend_npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", "apu_subgraph_pass", "rknpu_subgraph_pass", - "static_kernel_pick_pass", // pick original kernel from graph + "mlu_subgraph_pass", + "control_flow_op_unused_inputs_and_outputs_eliminate_pass", + "static_kernel_pick_pass", // pick original kernel from graph + + "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's + + "mlu_postprocess_pass", // info(target/precision/layout/device) // using kernel info "argument_type_display_pass", // debug pass: show arg-type-node's @@ -139,13 +159,9 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", - "mlu_subgraph_pass", - "runtime_context_assign_pass", "argument_type_display_pass", - "mlu_postprocess_pass", - "memory_optimize_pass"}}; if (passes.size() == 1) { @@ -172,13 +188,15 @@ class Optimizer { exec_scope_ = program.exec_scope(); } - const lite::Scope* exec_scope() const { return exec_scope_; } + const Scope* exec_scope() const { return exec_scope_; } // Generate a new program based on the mir graph. 
std::unique_ptr GenRuntimeProgram() { auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); - pass->Apply(graph_); + for (auto& graph : graphs_) { + pass->Apply(graph); + } auto program = pass->GenProgram(); CHECK(exec_scope_); program->set_exec_scope(exec_scope_); @@ -194,20 +212,32 @@ class Optimizer { pass->SetValidPlaces(valid_places_); } + void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() { + auto* pass = + mir::PassManager::Global() + .LookUp( + "control_flow_op_unused_inputs_and_outputs_eliminate_pass"); + CHECK(pass); + CHECK(!graphs_.empty()); + pass->SetAllGraphs(&graphs_); + } + // Generate C++ code which combines the inference program, model and weights. void GenCode(const std::string& code_dir); - const mir::SSAGraph& ssa_graph() const { - CHECK(graph_); - return *graph_; + const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return *graphs_[block_idx]; } - mir::SSAGraph* mutable_ssa_graph() { - CHECK(graph_); - return graph_.get(); + mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return graphs_[block_idx].get(); } - lite::Scope* exec_scope() { return exec_scope_; } + Scope* exec_scope() { return exec_scope_; } protected: void SpecifyKernelPickTactic(core::KernelPickFactor factor); @@ -231,16 +261,23 @@ class Optimizer { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; } else { - pass->Apply(graph_); + // Check the pass whether it is supported for processing subblocks + if (kSubblockUnsupportedPasses.count(x)) { + pass->Apply(graphs_[kRootBlockIdx]); + } else { + for (auto& graph : graphs_) { + pass->Apply(graph); + } + } LOG(INFO) << "== Finished running: " << x; } } } private: - std::unique_ptr graph_; + std::vector> graphs_; std::vector valid_places_; - lite::Scope* exec_scope_{}; + Scope* exec_scope_{}; Program* program_{}; }; diff --git a/lite/core/program.cc b/lite/core/program.cc index 6dee8796ee1dcd944940f41cb9454344fb8367a7..bd6dd09683b5004167ee1f8d6426fde0fff4f6b0 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -15,9 +15,8 @@ #include "lite/core/program.h" #include #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" @@ -28,122 +27,221 @@ namespace paddle { namespace lite { -void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - // NOTE: RuntimeProgram do not has all meta info, so save model just update - // upon origin model - CHECK(desc->BlocksSize()); - auto main_block = desc->GetBlock(0); - main_block->ClearOps(); - for (auto& node : instructions_) { - auto op_type = node.op()->op_info()->Type(); - if (op_type == "subgraph") { - auto subgraph_op = const_cast( - static_cast(node.op())); - int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); - if (sub_block_idx < 0) { - // It's a new subgraph op when its sub_block_idx < 0, Now we add its +void RuntimeProgram::SaveToProgram( + std::shared_ptr program_desc) { + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK_GT(block_size, 0) << "No block found!"; + // TODD(hong19860320) Only support updating the block desc which already + // exists in the origin program 
desc + CHECK_LE(block_size, instructions_.size()) + << "Invalid block size, expected (0," << instructions_.size() + << "] but got " << block_size; + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + // Record all of the origin vars in the origin block + std::map origin_var_maps; + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto v = block_desc->GetVar(var_idx); + origin_var_maps.emplace(v->Name(), *v); + } + // Update the ops and vars for each block according to the instructions + block_desc->ClearVars(); + block_desc->ClearOps(); + std::set already_added_vars; + for (auto& inst : instructions_[block_idx]) { + auto* op = const_cast(inst.op()); + auto* op_info = op->op_info(); + auto op_type = op_info->Type(); + auto* kernel = inst.mutable_kernel(); + auto* scope = op->scope(); + // Update the origin vars which are referred by the instructions + // Add the new vars which are created in the passes and referred by the + // instructions + auto var_names = op_info->input_names(); + auto out_names = op_info->output_names(); + // Combine input and output vars and delete the duplicates + var_names.insert(var_names.end(), out_names.begin(), out_names.end()); + std::stable_sort(var_names.begin(), var_names.end()); + var_names.erase(std::unique(var_names.begin(), var_names.end()), + var_names.end()); + for (auto& var_name : var_names) { + if (already_added_vars.count(var_name)) continue; + auto* v = block_desc->AddVar(); + v->SetName(var_name); + auto it = origin_var_maps.find(var_name); + if (it != origin_var_maps.end()) { + v->SetType(it->second.GetType()); + v->SetPersistable(it->second.Persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(it->second.GetShape()); + v->SetDataType(it->second.GetDataType()); + } + } else { + std::string arg_name; + const Type* decl_type; + if (op_info->GetInputArgname(var_name, &arg_name)) { + decl_type = kernel->GetInputDeclType(arg_name); + } else { + op_info->GetOutputArgname(var_name, &arg_name); + decl_type = kernel->GetOutputDeclType(arg_name); + } + if (decl_type->IsTensor()) { + v->SetType(cpp::VarDesc::Type::LOD_TENSOR); + auto tensor = scope->FindVar(var_name)->GetMutable(); + v->SetPersistable(tensor->persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(tensor->dims().data()); + auto precision = tensor->precision(); + switch (precision) { +#define SET_DATATYPE(precision__, data_type) \ + case PrecisionType::precision__: \ + v->SetDataType(data_type); \ + LOG(INFO) << "Update var " << var_name << " done"; \ + break + SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); + SET_DATATYPE(kFloat, VarDescAPI::VarDataType::FP32); + SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); + SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); + SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); + SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); + SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); +#undef SET_DATATYPE + default: + LOG(WARNING) << "Unknown precision type " + << PrecisionToStr(precision) << " for var " + << var_name << " in op " << op_type; + } + } + } else if (decl_type->IsTensorList()) { + // Set persistable=false for tensor array + v->SetType(cpp::VarDesc::Type::LOD_TENSOR_ARRAY); + v->SetPersistable(false); + } else { + CHECK(false) << "Unsupported decl type " << *decl_type + << " for var " << var_name << " in op " << op_type; + } + } + 
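[Editor's note] When SaveToProgram rebuilds a var desc from a runtime tensor, the SET_DATATYPE switch above converts the tensor's PrecisionType back into the serialized var data type, and an unknown precision is only warned about. A reduced sketch of that mapping; the enum values here are simplified stand-ins for PrecisionType and VarDescAPI::VarDataType:

#include <iostream>

enum class PrecisionType { kUnk, kBool, kFloat, kFP16, kInt8, kInt16, kInt32, kInt64 };
enum class VarDataType { UNK, BOOL, FP32, FP16, INT8, INT16, INT32, INT64 };

// Mirror of the SET_DATATYPE switch: runtime precision -> serialized var type.
VarDataType ToVarDataType(PrecisionType p) {
  switch (p) {
    case PrecisionType::kBool:  return VarDataType::BOOL;
    case PrecisionType::kFloat: return VarDataType::FP32;
    case PrecisionType::kFP16:  return VarDataType::FP16;
    case PrecisionType::kInt8:  return VarDataType::INT8;
    case PrecisionType::kInt16: return VarDataType::INT16;
    case PrecisionType::kInt32: return VarDataType::INT32;
    case PrecisionType::kInt64: return VarDataType::INT64;
    default:
      std::cerr << "Unknown precision type, leaving var data type unset\n";
      return VarDataType::UNK;
  }
}

int main() {
  std::cout << (ToVarDataType(PrecisionType::kFloat) == VarDataType::FP32) << "\n";  // 1
}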
already_added_vars.insert(var_name); + } + // Replace all of origin ops with the instructions + auto op_desc = block_desc->AddOp(); + *op_desc = *op_info; + op_desc->SetAttr(kKernelTypeAttr, kernel->SerializedKernelType()); + if (op_type == "subgraph" && !op_info->GetAttr("sub_block")) { + // It's a new subgraph op when its sub_block_idx = 0, Now we add its // subblock desc to the program desc, Then update its sub_block_idx to // the index of block desc of the program desc. - sub_block_idx = desc->BlocksSize(); - auto sub_block_desc = subgraph_op->GetSubBlock(); - CHECK(sub_block_desc); - auto new_block_desc = desc->AddBlock(); - *new_block_desc = *sub_block_desc; - delete sub_block_desc; - subgraph_op->mutable_op_info()->SetAttr("sub_block", - sub_block_idx); - subgraph_op->SetSubBlock(new_block_desc); - // Update main block desc after a new subblock desc is added - main_block = desc->GetBlock(0); + auto subgraph_op = static_cast(op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + auto sub_block_desc = program_desc->AddBlock(); + *sub_block_desc = *sub_program_desc->GetBlock(0); + subgraph_op->SetProgramDesc(program_desc); + op_desc->SetAttr("sub_block", program_desc->BlocksSize() - 1); + // Attach op and kernel again to update the new block_idx and + // program_desc + subgraph_op->Attach(*op_desc, scope); + subgraph_op->AttachKernel(kernel); + // Update the pointer of block desc after a new subblock desc is added + block_desc = program_desc->GetBlock(block_idx); } } - auto op = main_block->AddOp(); - *op = *node.op()->op_info(); - op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); } } -// `UpdateVarsOfProgram` will remove unused var_descs and add new created -// vars' descs in the block 0. Now, the type of a new created var can only -// be LOD_TENSOR. -void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - CHECK(desc->BlocksSize()); - std::map origin_var_maps; - auto& main_block = *desc->GetBlock(0); - auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { - auto v = main_block.GetVar(i); - auto name = v->Name(); - origin_var_maps.emplace(name, *v); - } - - main_block.ClearVars(); - for (auto& node : instructions_) { - auto* op = const_cast(node.op()); - auto* kernel = node.kernel(); - auto* scope = op->scope(); - auto in_names = op->op_info()->input_names(); - auto out_names = op->op_info()->output_names(); - in_names.insert(in_names.end(), out_names.begin(), out_names.end()); - std::stable_sort(in_names.begin(), in_names.end()); - in_names.erase(std::unique(in_names.begin(), in_names.end()), - in_names.end()); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") { - v->SetShape((it->second).GetShape()); - v->SetDataType((it->second).GetDataType()); - } +// Create runtime program from sub_block desc according to block_idx and +// program_desc, which is used for while/conditional_block/subgraph op. 
+RuntimeProgram::RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx) + : exec_scope_(exec_scope) { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size) << "No block found!"; + CHECK(block_idx >= 0 && block_idx < block_size) + << "Invalid block index, expected [0," << (block_size - 1) << "] but got " + << block_idx; + auto block_desc = program_desc->GetBlock(block_idx); + instructions_.resize(kRootBlockIdx + 1); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + std::string op_type = op_desc->Type(); + // if (op_type == "feed" || op_type == "fetch") continue; + // Create op and pick up the best kernel + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; + if (op_type == "while") { + static_cast(op.get())->SetProgramDesc(program_desc); + } else if (op_type == "conditional_block") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } else if (op_type == "subgraph") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } + op->Attach(*op_desc, exec_scope_); + std::unique_ptr kernel; + if (op_desc->HasAttr(kKernelTypeAttr)) { + // Create op and pick up the best kernel according to the + // kKernelTypeAttr attribute + auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); + std::string alias; + Place place; + KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); + VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type + << " for " << op_type; + auto kernels = op->CreateKernels({place}); + CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; + auto it = std::find_if( + kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { + return it->alias() == alias; + }); + CHECK(it != kernels.end()); + kernel = std::move(*it); + } else { + // TODO(hong19860320) add kernel picking according to the type of input + // and output tensors + VLOG(3) << "The attr '" << kKernelTypeAttr + << "' not found, pick the first kernel for " << op_type; + std::vector> kernels; +#if defined(LITE_WITH_ARM) + kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); +#elif defined(LITE_WITH_X86) + kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); +#endif + if (kernels.size() > 0) { + kernel = std::move(kernels.front()); } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(in_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string in_arg_name; - const Type* type; - if (op->op_info()->GetInputArgname(in_name, &in_arg_name)) { - type = kernel->GetInputDeclType(in_arg_name); - } else { - op->op_info()->GetOutputArgname(in_name, &in_arg_name); - type = kernel->GetOutputDeclType(in_arg_name); - } - if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - if (in_name != "feed" && in_name != "fetch") { - v->SetShape(tensor->dims().data()); - switch (tensor->precision()) { -#define SET_DATATYPE(precision__, data_type) \ - case PrecisionType::precision__: \ - v->SetDataType(data_type); \ - LOG(INFO) << "update var" << (it->second).Name() << "done"; \ - break - SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); - SET_DATATYPE(kFloat, 
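[Editor's note] When the kKernelTypeAttr attribute is present, the new RuntimeProgram constructor above parses it and then selects, among the kernels created for the recorded place, the one whose alias matches. A small sketch of that selection step, using a placeholder FakeKernel type instead of the real KernelBase:

#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for a created kernel candidate.
struct FakeKernel {
  std::string alias;
};

// Pick the candidate whose alias matches the one parsed from kKernelTypeAttr,
// mirroring the std::find_if in the new RuntimeProgram constructor.
std::unique_ptr<FakeKernel> PickByAlias(std::vector<std::unique_ptr<FakeKernel>>& kernels,
                                        const std::string& alias) {
  auto it = std::find_if(kernels.begin(), kernels.end(),
                         [&](std::unique_ptr<FakeKernel>& k) { return k->alias == alias; });
  if (it == kernels.end()) return nullptr;  // the real code CHECKs instead
  return std::move(*it);
}

int main() {
  std::vector<std::unique_ptr<FakeKernel>> kernels;
  kernels.push_back(std::make_unique<FakeKernel>(FakeKernel{"def"}));
  kernels.push_back(std::make_unique<FakeKernel>(FakeKernel{"fp16"}));
  auto picked = PickByAlias(kernels, "fp16");
  std::cout << (picked ? picked->alias : "none") << "\n";  // prints fp16
}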
VarDescAPI::VarDataType::FP32); - SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); - SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); - SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); - SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); - SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); -#undef SET_DATATYPE - default: - VLOG(4) << "warning! unknown precision type"; - } - } - } else { - CHECK(false) << "unsupported var type"; - } + LOG(WARNING) << "No kernels found for " << op_type; } } +#ifdef LITE_WITH_OPENCL + if (kernel->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + kernel->SetContext(std::move(ctx)); + } else { + kernel->SetContext( + ContextScheduler::Global().NewContext(kernel->target())); + } +#else + kernel->SetContext(ContextScheduler::Global().NewContext(kernel->target())); +#endif + instructions_[kRootBlockIdx].emplace_back(std::move(op), std::move(kernel)); } + Init(); } + void RuntimeProgram::Run() { #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); @@ -160,7 +258,8 @@ void RuntimeProgram::Run() { } #endif int idx = -1; - for (auto& inst : instructions_) { + auto& insts = instructions_[kRootBlockIdx]; + for (auto& inst : insts) { ++idx; #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -193,57 +292,50 @@ void RuntimeProgram::Run() { #endif } -void Program::Build(const cpp::ProgramDesc& prog) { +void Program::Build(const std::shared_ptr& program_desc) { CHECK(ops_.empty()) << "Executor duplicate Build found"; // Create operators. - auto program = prog; - CHECK(program.BlocksSize()); - auto& main_block = *program.GetBlock(0); - for (size_t i = 0; i < main_block.OpsSize(); ++i) { - auto& op_desc = *main_block.GetOp(i); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - if (op_type == "while" || op_type == "conditional_block" || - op_type == "subgraph") { - auto sub_block_idx = op_desc.GetAttr("sub_block"); - CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize()) - << "Invalid attribute sub_block(" << sub_block_idx << ") for " - << op_type; - auto sub_block_desc = - const_cast(prog).GetBlock( - sub_block_idx); - CHECK(sub_block_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + ops_.resize(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto* op_desc = block_desc->GetOp(op_idx); + auto op_type = op_desc->Type(); + VLOG(4) << "create Op [" << op_type << "]"; + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; if (op_type == "while") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "conditional_block") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "subgraph") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } + op->Attach(*op_desc, exec_scope_); + ops_[block_idx].emplace_back(std::move(op)); } - 
ops_.emplace_back(std::move(op)); - ops_.back()->Attach(op_desc, exec_scope_); } } -void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, - const std::vector& var_names) { +void Program::PrepareWorkspace( + const std::shared_ptr& program_desc, + const std::vector& vars_to_clone) { CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; exec_scope_ = &scope_->NewScope(); // Create Feed and Fetch var. scope_->Var("feed")->GetMutable>(); scope_->Var("fetch")->GetMutable>(); - tmp_vars_.push_back("feed"); - tmp_vars_.push_back("fetch"); + vars_.push_back("feed"); + vars_.push_back("fetch"); - auto VarPrecision2KernlPrecision = + auto VarDescType2PrecisionType = [](const lite::VarDescAPI::Type& type) -> PrecisionType { switch (type) { case lite::VarDescAPI::Type::FP32: @@ -259,44 +351,60 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, case lite::VarDescAPI::Type::INT64: return PRECISION(kInt64); default: - // LOG(FATAL) << "not supported type: " << static_cast(type); + LOG(WARNING) << "Unable to convert var desc type(" + << static_cast(type) << ") to precision type!"; return PRECISION(kUnk); } }; - auto program = prog; - CHECK(program.BlocksSize()); - for (size_t b = 0; b < program.BlocksSize(); ++b) { - auto& main_block = *program.GetBlock(b); - for (size_t i = 0; i < main_block.VarsSize(); ++i) { - auto& var_desc = *main_block.GetVar(i); - if (!var_desc.Persistable()) { - if (var_desc.GetType() == lite::VarDescAPI::Type::LOD_TENSOR && - VarPrecision2KernlPrecision(var_desc.GetDataType()) != - PRECISION(kUnk)) { - var_data_type_[var_desc.Name()] = - VarPrecision2KernlPrecision(var_desc.GetDataType()); - } - tmp_vars_.push_back(var_desc.Name()); - VLOG(4) << "var name: " << var_desc.Name() << " type is " - << static_cast(var_desc.GetType()) << " data type is " - << static_cast(var_desc.GetDataType()); - exec_scope_->Var(var_desc.Name()); - if (b > 0) { - VLOG(4) << "var: " << var_desc.Name(); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto* var_desc = block_desc->GetVar(var_idx); + const auto& var_name = var_desc->Name(); + const auto& var_type = var_desc->GetType(); + if (!var_desc->Persistable()) { + vars_.push_back(var_name); + auto* var = exec_scope_->Var(var_name); + VLOG(4) << "Var " << var_name << " in block " << block_idx; + VLOG(4) << " - type " << static_cast(var_type); + if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) { + const auto& var_data_type = + VarDescType2PrecisionType(var_desc->GetDataType()); + if (var_data_type != PRECISION(kUnk)) { + var_type_map_[var_name] = LiteType::GetTensorTy( + TARGET(kUnk), var_data_type, DATALAYOUT(kUnk)); + } + VLOG(4) << " - data type " << static_cast(var_data_type); + // Create the tensor with the shape from var desc, it's convenient to + // the graph analysis in the passes, but you should resize the tensor + // with the real shape before accessing its data, because the + // var_shape may be [-1,3,224,224] + const auto& var_shape = var_desc->GetShape(); + auto* tensor = var->GetMutable(); + if (tensor->dims().empty() && !var_shape.empty()) { + tensor->Resize(var_shape); + VLOG(4) << " - dims " << tensor->dims().repr(); + } + } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) { + var_type_map_[var_name] = LiteType::GetTensorListTy( + TARGET(kUnk), 
PRECISION(kUnk), DATALAYOUT(kUnk)); } } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights_.push_back(var_desc.Name()); - if (var_desc.Persistable()) scope_->Var(var_desc.Name()); + if (var_name == "feed" || var_name == "fetch") continue; + weights_.push_back(var_name); + scope_->Var(var_name); } } } - for (auto i : var_names) { - exec_scope_->LocalVar(i); - auto* tensor = scope_->Var(i)->GetMutable(); - auto* sub_tensor = exec_scope_->Var(i)->GetMutable(); + for (auto var_name : vars_to_clone) { + exec_scope_->LocalVar(var_name); + auto* tensor = scope_->Var(var_name)->GetMutable(); + auto* sub_tensor = exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } } diff --git a/lite/core/program.h b/lite/core/program.h index 6fe65f158b8d547e7a741e329a192d2661a60060..f0715b9760b81f8de42e0acee5f5839fc42dd65a 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,7 +22,7 @@ #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/profiler.h" #endif @@ -41,58 +41,66 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; // - scope: which contains all the weights struct Program { public: - explicit Program(const std::shared_ptr& root) { scope_ = root; } - Program(const cpp::ProgramDesc& desc, - const std::shared_ptr& root, + explicit Program(const std::shared_ptr& root_scope) { + scope_ = root_scope; + } + Program(const std::shared_ptr& program_desc, + const std::shared_ptr& root_scope, const std::vector& valid_places, const std::vector& var_names = {}) - : scope_(root), valid_places_(valid_places), desc_(desc) { + : scope_(root_scope), valid_places_(valid_places) { CHECK(scope_) << "scope should be init first"; VLOG(4) << "prepare work"; - PrepareWorkspace(desc, var_names); + PrepareWorkspace(program_desc, var_names); VLOG(4) << "build desc"; - Build(desc); + Build(program_desc); VLOG(4) << "build desc finished"; } std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc_, scope_, valid_places_)); - return res; + return std::unique_ptr(new Program(scope_)); } const std::list& weights() const { return weights_; } - const std::list& tmp_vars() const { return tmp_vars_; } + const std::list& vars() const { return vars_; } std::list* mutable_weights() { return &weights_; } - std::list* mutable_tmp_vars() { return &tmp_vars_; } + std::list* mutable_vars() { return &vars_; } - const std::list>& ops() const { return ops_; } - std::list>* mutable_ops() { return &ops_; } + const std::list>& ops( + int block_idx = kRootBlockIdx) const { + return ops_[block_idx]; + } + std::list>* mutable_ops( + int block_idx = kRootBlockIdx) { + return &ops_[block_idx]; + } - lite::Scope* exec_scope() { return exec_scope_; } - lite::Scope* scope() { return scope_.get(); } + size_t block_size() { return ops_.size(); } - const std::map& var_data_type() const { - return var_data_type_; + Scope* exec_scope() { return exec_scope_; } + Scope* scope() { return scope_.get(); } + + const std::map& var_type_map() const { + return var_type_map_; } private: // Build from a program and scope. - void Build(const cpp::ProgramDesc& program); + void Build(const std::shared_ptr& program_desc); // Create temporary variables. 
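[Editor's note] The vars_to_clone loop at the end of PrepareWorkspace above creates a local variable in the execution scope and copies the data from the root-scope tensor, so the cloned variable can later diverge from the original. A toy illustration with stand-in MiniScope/MiniTensor types (the real code goes through LocalVar/Var/GetMutable, simplified away here):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Minimal stand-ins for Scope/Tensor, just enough to show the cloning step.
struct MiniTensor {
  std::vector<float> data;
  void CopyDataFrom(const MiniTensor& other) { data = other.data; }
};

struct MiniScope {
  std::map<std::string, MiniTensor> vars;
  MiniTensor* Var(const std::string& name) { return &vars[name]; }
};

int main() {
  MiniScope root, exec;  // exec plays the role of the execution scope
  root.Var("w")->data = {1.f, 2.f, 3.f};

  // Mirrors the vars_to_clone loop: create a local copy in the exec scope and
  // copy the data from the root-scope tensor.
  std::vector<std::string> vars_to_clone = {"w"};
  for (const auto& name : vars_to_clone) {
    exec.Var(name)->CopyDataFrom(*root.Var(name));
  }
  std::cout << exec.Var("w")->data.size() << "\n";  // 3
}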
- void PrepareWorkspace(const cpp::ProgramDesc& program, - const std::vector& var_names = {}); + void PrepareWorkspace(const std::shared_ptr& program_desc, + const std::vector& vars_to_clone = {}); private: - std::map var_data_type_; - std::list tmp_vars_; + std::map var_type_map_; + std::list vars_; std::list weights_; - std::list> ops_; + std::vector>> ops_; // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope_; + std::shared_ptr scope_; std::vector valid_places_; // Runtime scope. - lite::Scope* exec_scope_{}; - cpp::ProgramDesc desc_; + Scope* exec_scope_{}; }; struct Instruction { @@ -170,8 +178,22 @@ struct Instruction { */ class LITE_API RuntimeProgram { public: - explicit RuntimeProgram(std::vector&& insts) + explicit RuntimeProgram(std::vector>&& insts) : instructions_(std::move(insts)) { + Init(); + } + explicit RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx = kRootBlockIdx); + ~RuntimeProgram() { +#ifdef LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); +#endif // LITE_WITH_PROFILE + } + + void Init() { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } @@ -180,7 +202,7 @@ class LITE_API RuntimeProgram { #endif #ifdef LITE_WITH_NVTX const NVTXAnnotator& annotator = NVTXAnnotator::Global(); - for (auto& inst : instructions_) { + for (auto& inst : instructions_[kRootBlockIdx]) { NVTXRangeAnnotation annotation = annotator.AnnotateBlock(); register_layer_names_.push_back(annotator.RegisterString( const_cast(inst.op())->Type().c_str())); @@ -188,41 +210,38 @@ class LITE_API RuntimeProgram { register_layer_names_.push_back(annotator.RegisterString("one_loop")); #endif } - ~RuntimeProgram() { -#ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); -#endif // LITE_WITH_PROFILE - } void Run(); - void set_exec_scope(lite::Scope* x) { exec_scope_ = x; } - lite::Scope* exec_scope() { return exec_scope_; } + void set_exec_scope(Scope* x) { exec_scope_ = x; } + Scope* exec_scope() { return exec_scope_; } - size_t num_instructions() const { return instructions_.size(); } + const std::vector& instructions( + int block_idx = kRootBlockIdx) const { + return instructions_[block_idx]; + } - const std::vector& instructions() const { return instructions_; } + std::vector* mutable_instructions( + int block_idx = kRootBlockIdx) { + return &instructions_[block_idx]; + } - // `SaveOpInfosToProgram` will update the op list(ops_) of the block 0 - // in ProgramDesc. - void SaveOpInfosToProgram(cpp::ProgramDesc* desc); + size_t block_size() { return instructions_.size(); } - // `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in - // ProgramDesc. Namely, if a new var created in some passes, its var_desc will - // be added in vars_. 
- void UpdateVarsOfProgram(cpp::ProgramDesc* desc); + // Update the ops and vars of all of blocks to the given program_desc + // according to the instructions + void SaveToProgram(std::shared_ptr program_desc); private: RuntimeProgram(const RuntimeProgram&) = delete; - std::vector instructions_; - lite::Scope* exec_scope_{}; + std::vector> instructions_; + Scope* exec_scope_{}; #ifdef LITE_WITH_PROFILE profile::Profiler profiler_; void set_profiler() { - for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { - i->set_profiler(&profiler_); + for (auto& inst : instructions_[kRootBlockIdx]) { + inst.set_profiler(&profiler_); } } #endif diff --git a/lite/core/scope.h b/lite/core/scope.h index 57e4e3a5e058000f963ff369cbd25e69b9c981c6..41d6ee8f4f55268e3389cd4cada7e48fb8f922d7 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -62,19 +62,36 @@ class Scope final { // Create a Tensor variable. This will create a new Variable called `name`. Tensor* NewTensor(const std::string& name) { auto* var = Var(name); - return var->GetMutable(); + return var->GetMutable(); } const Tensor* FindTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return &var->Get(); + return &var->Get(); } Tensor* FindMutableTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return var->GetMutable(); + return var->GetMutable(); + } + + std::vector* NewTensorList(const std::string& name) { + auto* var = Var(name); + return var->GetMutable>(); + } + + const std::vector* FindTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return &var->Get>(); + } + + std::vector* FindMutableTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return var->GetMutable>(); } private: diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 197ee4ddbcd5df62dd0f8a15eba39e2a880f7125..3b21cf9147ded7b05938edc6c2985c8fce23842f 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { lod_ = other.lod_; memory_size_ = other.memory_size_; precision_ = other.precision_; + persistable_ = other.persistable_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 3d09c071aa7ecbe51f1723cad314f2aedcdb2bd7..2604f104e72081025d9bd59bb60843cc627ad54f 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -78,6 +78,28 @@ void RunModel(std::string model_dir, // 1. Set MobileConfig MobileConfig config; config.set_model_from_file(model_dir); + + // NOTE: Use android gpu with opencl, you should ensure: + // first, [compile **cpu+opencl** paddlelite + // lib](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md); + // second, [convert and use opencl nb + // model](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md). + // + /* Uncomment code below to enable OpenCL + bool is_opencl_backend_valid = ::IsOpenCLBackendValid(); + std::cout << "is_opencl_backend_valid:" << is_opencl_backend_valid << + std::endl; + if (is_opencl_backend_valid) { + // give opencl nb model dir + config.set_model_from_file(model_dir); + } else { + std::cout << "Unsupport opencl nb model." 
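[Editor's note] The new Scope helpers mirror the existing tensor accessors for vector-of-tensor variables: New* creates the variable, Find* returns nullptr when the name is absent. A toy scope with the same three-method shape (ToyScope/ToyTensor are stand-ins, not the real lite classes):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ToyTensor { std::vector<int64_t> dims; };

class ToyScope {
 public:
  std::vector<ToyTensor>* NewTensorList(const std::string& name) {
    return &lists_[name];  // creates the variable if it does not exist yet
  }
  const std::vector<ToyTensor>* FindTensorList(const std::string& name) const {
    auto it = lists_.find(name);
    return it == lists_.end() ? nullptr : &it->second;  // nullptr when absent
  }
  std::vector<ToyTensor>* FindMutableTensorList(const std::string& name) {
    auto it = lists_.find(name);
    return it == lists_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::string, std::vector<ToyTensor>> lists_;
};

int main() {
  ToyScope scope;
  auto* xs = scope.NewTensorList("xs");
  xs->resize(2);
  std::cout << scope.FindTensorList("xs")->size() << "\n";       // 2
  std::cout << (scope.FindTensorList("ys") == nullptr) << "\n";  // 1
}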
<< std::endl; + exit(1); + // you can give backup cpu nb model instead + // config.set_model_from_file(cpu_nb_model_dir); + } + */ + // NOTE: To load model transformed by model_optimize_tool before // release/v2.3.0, plese use `set_model_dir` API as listed below. // config.set_model_dir(model_dir); diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index 0dab71ed26c1b4ee438f52e088614bb577a9eade..3ad02a9c53c311a9253bbdf481c9aa6288685654 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -67,7 +67,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) { if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + LOG(FATAL) << "Not support " << type.name() << " as tensor type"; return static_cast(-1); } @@ -76,8 +76,8 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_cpp_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::type_index(typeid(void)); } @@ -86,8 +86,8 @@ std::string DataTypeToString(const framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_str_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::string(); } @@ -96,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str()); + LOG(FATAL) << "Not support " << DataTypeToString(type).c_str() + << " as tensor type"; return 0; } diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h index a8b11ec465e00356561c95b56f63e3c56cbe8a5b..9896c0d54844b99748e1a7c8bddc5e178f84fb51 100644 --- a/lite/fluid/data_type.h +++ b/lite/fluid/data_type.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/framework.pb.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -72,7 +72,7 @@ inline void VisitDataType(framework::proto::VarType::Type type, _ForEachDataType_(VisitDataTypeCallback); #undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); + LOG(FATAL) << "Not supported " << type; } extern std::string DataTypeToString(const framework::proto::VarType::Type type); diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h index c3af7e9f6c3588f404c614430bf01f7ab5e099e5..3312c9c39eaad4fc0a4225d9734b3f80790b2979 100644 --- a/lite/fluid/eigen.h +++ b/lite/fluid/eigen.h @@ -17,7 +17,7 @@ limitations under the License. 
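[Editor's note] The PADDLE_THROW calls in lite/fluid/data_type.cc are replaced with LOG(FATAL) stream messages, but the lookup itself is unchanged: a table keyed by std::type_index with an error path on a miss. A toy version of ToDataType with an illustrative, deliberately incomplete table (the enum and its contents are not the real proto types):

#include <iostream>
#include <map>
#include <string>
#include <typeindex>
#include <typeinfo>

enum class ProtoType { FP32, INT32, INT64, UNKNOWN };

ProtoType ToDataType(std::type_index type) {
  static const std::map<std::type_index, ProtoType> table = {
      {typeid(float), ProtoType::FP32},
      {typeid(int), ProtoType::INT32},
      {typeid(long long), ProtoType::INT64},
  };
  auto it = table.find(type);
  if (it != table.end()) return it->second;
  // The real code now reports this through LOG(FATAL) instead of PADDLE_THROW.
  std::cerr << "Not support " << type.name() << " as tensor type\n";
  return ProtoType::UNKNOWN;
}

int main() {
  std::cout << (ToDataType(typeid(float)) == ProtoType::FP32) << "\n";  // 1
  ToDataType(typeid(double));  // prints the unsupported-type message
}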
*/ #include #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -30,7 +30,7 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const lite::DDim& dims) { - PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size"); + CHECK_EQ(dims.size(), D) << "D must match DDim::size"; Type ret; for (size_t d = 0; d < dims.size(); d++) { ret[d] = dims[d]; @@ -39,7 +39,7 @@ struct EigenDim { } static Type From(const DDim::value_type length) { - PADDLE_ENFORCE_EQ(D, 1, "D must be 1."); + CHECK_EQ(D, 1) << "D must be 1."; Type ret; ret[0] = length; return ret; @@ -84,16 +84,16 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } }; diff --git a/lite/fluid/rw_lock.h b/lite/fluid/rw_lock.h index eb9829425eca9d8bd363a45961302a7f3818e513..f68a21502073ccde6d27c46793d3f8cfa0751af3 100644 --- a/lite/fluid/rw_lock.h +++ b/lite/fluid/rw_lock.h @@ -20,7 +20,7 @@ limitations under the License. 
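[Editor's note] EigenMatrix::Reshape now CHECKs that num_col_dims lies strictly between 0 and the tensor rank before calling Flatten2D. Assuming the usual Paddle convention that Flatten2D folds the leading num_col_dims dimensions into rows and the remaining ones into columns, a small sketch of the precondition and the resulting 2-D shape:

#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Fold dims [0, num_col_dims) into the row extent and the rest into the
// column extent; num_col_dims must lie strictly between 0 and rank, which is
// what the new CHECK in Reshape enforces.
std::pair<int64_t, int64_t> Flatten2D(const std::vector<int64_t>& dims, int num_col_dims) {
  int rank = static_cast<int>(dims.size());
  assert(num_col_dims > 0 && num_col_dims < rank);
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  auto rc = Flatten2D({2, 3, 4, 5}, 2);
  std::cout << rc.first << " x " << rc.second << "\n";  // 6 x 20
}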
*/ #include // NOLINT #endif // !_WIN32 -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -33,17 +33,15 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); + CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed"; } inline void WRLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); + CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed"; } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed"; } private: diff --git a/lite/fluid/selected_rows.cc b/lite/fluid/selected_rows.cc index 98e9325ca2f8fab3f8aa77a0bb074ae5d1be7670..361d63cf5dfd9cd21db47917047a7e2f3758ec96 100644 --- a/lite/fluid/selected_rows.cc +++ b/lite/fluid/selected_rows.cc @@ -119,7 +119,7 @@ void DeserializeFromStream( // the 1st field, unit32_t version for SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; } { // the 2st field, rows information @@ -163,24 +163,22 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, if (iter == id_to_index_.end()) { rwlock_->UNLock(); if (!auto_grown) { - PADDLE_THROW("key %ld not found", key); + LOG(FATAL) << "key " << key << " not found"; } rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %lu should have the same size with rows_ %lu", - map_size, - vector_size); + LOG(FATAL) << "id_to_index_ size " << map_size + << " should have the same size with rows_ " << vector_size; } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + LOG(FATAL) << "selected rows is full, then length exceed " << row_num; } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -213,16 +211,14 @@ void SelectedRows::Get(const lite::Tensor& ids, lite::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + CHECK(value->IsInitialized()) << "The value tensor should be initialized."; if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, - value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + CHECK_EQ(value_width, value->numel() / value->dims()[0]) + << "output tensor should have the same shape with table " + "except the dims[0]."; for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/lite/fluid/selected_rows.h b/lite/fluid/selected_rows.h index 5db322f8592f4518d9e1ccc996ffb1e847e7b964..aad93552ebef5d67c77e554b29bf593f5cd176f7 100644 --- a/lite/fluid/selected_rows.h +++ b/lite/fluid/selected_rows.h @@ -82,7 +82,7 @@ class SelectedRows { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == 
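[Editor's note] SelectedRows::AutoGrownIndex keeps its locking structure and only changes the error reporting to LOG(FATAL). The pattern itself is worth spelling out: look up under the read lock, and on a miss drop it, take the write lock, and re-check before growing the table, since another writer may have won the race. A sketch of that pattern, with std::shared_mutex standing in for the pthread-based RWLock:

#include <cstdint>
#include <iostream>
#include <map>
#include <shared_mutex>
#include <vector>

class AutoGrownTable {
 public:
  int64_t IndexOf(int64_t key) {
    {
      std::shared_lock<std::shared_mutex> rd(mutex_);  // read lock for the fast path
      auto it = id_to_index_.find(key);
      if (it != id_to_index_.end()) return it->second;
    }
    std::unique_lock<std::shared_mutex> wr(mutex_);  // write lock to grow the table
    auto it = id_to_index_.find(key);                // re-check under the write lock
    if (it != id_to_index_.end()) return it->second;
    rows_.push_back(key);
    int64_t index = static_cast<int64_t>(rows_.size()) - 1;
    id_to_index_[key] = index;
    return index;
  }

 private:
  std::shared_mutex mutex_;
  std::map<int64_t, int64_t> id_to_index_;
  std::vector<int64_t> rows_;
};

int main() {
  AutoGrownTable table;
  int64_t a = table.IndexOf(42);
  int64_t b = table.IndexOf(7);
  int64_t c = table.IndexOf(42);
  std::cout << a << " " << b << " " << c << "\n";  // 0 1 0
}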
rows_.end()) { - PADDLE_THROW("id %ld not in table", key); + LOG(FATAL) << "id " << key << " not in table"; } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 2416278ad74068d28f6de523c55513891b08cc72..5dffd7c1a93225a38e433a4ff447b9b0fc863216 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b..a1e69b624a600719121926fc3a4f58391fa63ce6 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -59,7 +59,7 @@ void Module::AddHeaderIncludeGenCode() { Line("#include \"lite/gen_code/paddle_infer.h\""); Line("#include \"lite/core/op_registry.h\""); Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); + Line("#include \"lite/model_parser/cpp_desc.h\""); Line(""); Line(""); } diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index d316eac43f99664fa71cba54b3ab5360852300a0..e100904a7fe4f9c3e489c056ceeeba21657b4944 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -20,9 +20,9 @@ #include "lite/core/program.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/pb/op_desc.h" #include "lite/utils/all.h" diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc index d0b1c1f8b23f90976f4b315a1a4e13069b2136f1..5b3db0de8342f312dcb4443ebcd1fd72e857eea0 100644 --- a/lite/gen_code/gen_code_test.cc +++ b/lite/gen_code/gen_code_test.cc @@ -25,7 +25,7 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" #include "lite/model_parser/pb/program_desc.h" diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 17a836b17183d69b0e2a15b46b7a2097c323312f..91268bc28dbdf38137904f986b254a76cbd5e538 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(mlu) add_subdirectory(apu) add_subdirectory(bm) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index ca6e0ff2ac3930fe5cab9230dbbefa0af0a864ab..bf5e313180d9d8089b29f993384bd243b2a5ed05 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int neuron_errCode; VLOG(3) << "[APU] Converting [" << op_type << "]"; + CHECK(op_info->HasAttr("enable_int8") && + 
op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); @@ -94,30 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - float input_scale; - float output_scale; - std::vector weight_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - weight_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - output_scale = op_info->GetAttr("output_scale"); - VLOG(3) << "has output scale:" << output_scale; - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims - << " ,weight_scale size: " << weight_scale.size(); + << " ,filter_scale size: " << filter_scale.size(); VLOG(3) << "filter_dims: " << filter_dims << " ,memory_size: " << filter->memory_size() << " ,data_size: " << filter->data_size(); @@ -216,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronOperandType filterType; NeuronOperandType channelFilterType; NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { // Per layer type filterType.type = NEURON_TENSOR_QUANT8_ASYMM; - filterType.scale = weight_scale[0]; + filterType.scale = filter_scale[0]; filterType.zeroPoint = 128; filterType.dimensionCount = filter_dims.size(); filterType.dimensions = &dims_filter[0]; @@ -237,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { symmPerChannelQuantParams.channelDim = 3; else symmPerChannelQuantParams.channelDim = 0; - symmPerChannelQuantParams.scaleCount = weight_scale.size(); - symmPerChannelQuantParams.scales = weight_scale.data(); + symmPerChannelQuantParams.scaleCount = filter_scale.size(); + symmPerChannelQuantParams.scales = filter_scale.data(); biasType.scale = 0; } std::shared_ptr filter_node = nullptr; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { NeuronModel_addOperand(model, &filterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); - VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" - << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]" + << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" << filterType.dimensions[3]; memcpy(filter->mutable_data(), @@ -263,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_addOperand(model, &channelFilterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "chennel filter node idx: " << filter_node->index() - << " 
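[Editor's note] In the APU conv bridge the quantization scales now come from op_info->GetInputScale/GetOutputScale rather than raw attributes, and the filter operand is described as per-layer or per-channel depending on how many filter scales there are (with the channel dimension set to 3 or 0 depending on the filter layout). A sketch of that branch with illustrative struct names, not the real NeuronAdapter types:

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-ins for the Neuron operand descriptors.
struct PerLayerQuant   { float scale; int32_t zero_point; };
struct PerChannelQuant { std::vector<float> scales; uint32_t channel_dim; };

// Mirrors the branch in the conv bridge: one scale -> per-layer asymmetric
// quantization, many scales -> symmetric per-channel quantization.
void DescribeFilterQuant(const std::vector<float>& filter_scale, uint32_t channel_dim) {
  if (filter_scale.size() == 1) {
    PerLayerQuant q{filter_scale[0], 128};
    std::cout << "per-layer: scale=" << q.scale << " zp=" << q.zero_point << "\n";
  } else {
    PerChannelQuant q{filter_scale, channel_dim};
    std::cout << "per-channel: " << q.scales.size() << " scales on dim "
              << q.channel_dim << "\n";
  }
}

int main() {
  DescribeFilterQuant({0.02f}, 0);                // per-layer
  DescribeFilterQuant({0.02f, 0.03f, 0.05f}, 3);  // per-channel on dim 3
}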
,scale_count:" << weight_scale.size() - << " weight_scale[0]:" << weight_scale.data()[0] + << " ,scale_count:" << filter_scale.size() + << " filter_scale[0]:" << filter_scale.data()[0] << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" << channelFilterType.dimensions[1] << ":" << channelFilterType.dimensions[2] << ":" @@ -298,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); @@ -364,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - if (graph->IsOutput(output_name)) - outType.scale = output_scale / 127; - else - outType.scale = output_scale; + outType.scale = output_scale; outType.zeroPoint = 128; outType.dimensionCount = output_dims.size(); std::vector dims_out = {(uint32_t)output_dims[0], @@ -401,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int32_t* int32_bias_data = reinterpret_cast(bias->mutable_data()); float2int32( - bias->data(), input_scale, weight_scale, int32_bias_data); + bias->data(), input_scale, filter_scale, int32_bias_data); VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index a00a35f9a0766b4fb4f02d05419a0ae42354ca37..106ce2c16f3fd287a27c92179fa3a429c7be57c8 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); @@ -52,23 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " out_dims: " << out_dims << " m: " << m << " k: " << k << " n: " << n; - float input_scale = 1.0f; - float out_scale = 1.0f; - std::vector w_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - w_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(w_name)); + auto w_scale = op_info->GetInputScale(w_name); + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Add input tensor type NeuronOperandType inType; diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc index 2bda76ab99af727276102e884f84534b77a59586..b82f23beaf715e8c720ffc22792b804ff6c2c225 100644 --- a/lite/kernels/apu/bridges/pool_op.cc +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { 
auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "] "; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -87,22 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ksize); // Add x tensor type - float x_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - x_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; NeuronOperandType xType; xType.type = NEURON_TENSOR_QUANT8_ASYMM; diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc index 6a289ac987b9fa300cb548d190b6e46b67f24c44..dec6d12307b50798d04f743064360aa6870acfa3 100644 --- a/lite/kernels/apu/bridges/softmax_op.cc +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -45,22 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis += x_rank; } - float input_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Check output scale NeuronOperandType xType; @@ -104,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add out operand NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - outType.scale = out_scale / 127; + outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = x_dims.size(); outType.dimensions = &dims_x[0]; NeuronModel_addOperand(model, &outType); // 3: output std::shared_ptr out_node = nullptr; out_node = graph->Add(out_name, dims_x); - VLOG(3) << "output_scale: " << out_scale; + VLOG(3) << "out_scale: " << out_scale; float beta_val[] = {1.0f}; NeuronModel_setOperandValue( diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc index 6009e71e05c33f6dedfd995020612e112c888d36..579ed97b161dade9822250dab411cefd214b50f8 100644 --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -28,7 +28,7 @@ namespace lite { namespace kernels { namespace apu { -int 
SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -37,8 +37,8 @@ int SubgraphEngine::BuildDeviceProgram() { subgraph::apu::Graph graph; int neuron_errCode = NeuronModel_create(&model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create model"; - return subgraph::FAILED; + LOG(WARNING) << "[APU] Failed to create the neuron model!"; + return false; } graph.set_model(model_); graph.set_input_names(input_names_); @@ -46,15 +46,19 @@ int SubgraphEngine::BuildDeviceProgram() { // Convert all of ops and their input vars and weights and added into the APU // NIR graph + if (!origin_program_) { + BuildOriginProgram(); + } const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kAPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); @@ -63,60 +67,43 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Get input tensor - std::vector ins; - origin_itensors_.resize(input_names_.size()); - origin_idims_.resize(input_names_.size()); + // Get the index of input tensors + std::vector input_indices; for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" - << origin_idims_[i].production(); - // Get input index - int idx; - if (graph.Has(input_names_[i])) { - ins.push_back(graph.Get(input_names_[i])->index()); - VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find input: " << input_names_[i]; - return subgraph::FAILED; - } + CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node " + << input_names_[i]; + auto index = graph.Get(input_names_[i])->index(); + input_indices.push_back(index); + VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims " + << origin_itensors_[i]->dims() << " index " << index; } - // Get output tensor - std::vector outs; - origin_otensors_.resize(output_names_.size()); - origin_odims_.resize(output_names_.size()); + // Get the index of output tensors + std::vector output_indices; for (int i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" - << origin_odims_[i].production(); + CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node " + << output_names_[i]; origin_otensors_[i]->mutable_data(); - // Get input index - if (graph.Has(output_names_[i])) { - outs.push_back(graph.Get(output_names_[i])->index()); - VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find output: " << output_names_[i]; - return subgraph::FAILED; - } + auto index = graph.Get(output_names_[i])->index(); + output_indices.push_back(index); + 
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims " + << origin_otensors_[i]->dims() << " index " << index; } - VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); - // Set subgraph input/output - NeuronModel_identifyInputsAndOutputs( - model_, ins.size(), &ins[0], outs.size(), &outs[0]); + // Indentify the input and output tensors of the neuron model + NeuronModel_identifyInputsAndOutputs(model_, + input_indices.size(), + &input_indices[0], + output_indices.size(), + &output_indices[0]); neuron_errCode = NeuronModel_finish(model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; - return subgraph::FAILED; + LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode; + return false; } VLOG(3) << "[APU] APU NIR model created!"; @@ -129,15 +116,14 @@ int SubgraphEngine::BuildDeviceProgram() { compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; - - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -149,22 +135,19 @@ int SubgraphEngine::LaunchDeviceProgram() { int neuron_errCode = NeuronExecution_create(compilation_, &run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "[APU] Build APU runtime failed!"; - return subgraph::FAILED; + return false; } // Set input buffer - Tensor input_temp; for (size_t i = 0; i < origin_itensors_.size(); i++) { - input_temp.Resize({origin_idims_[i]}); - uint8_t* input_data = input_temp.mutable_data(); - memcpy(input_data, - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + auto origin_data = origin_itensors_[i]->mutable_data(); + auto converted_data = reinterpret_cast(origin_data); for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { - input_data[j] += (uint8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) + 128); } NeuronExecution_setInput( - run, i, NULL, input_data, origin_itensors_[i]->memory_size()); + run, i, NULL, converted_data, origin_itensors_[i]->memory_size()); } // Set output buffer @@ -180,19 +163,20 @@ int SubgraphEngine::LaunchDeviceProgram() { neuron_errCode = NeuronExecution_compute(run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; - return subgraph::FAILED; + return false; } for (size_t i = 0; i < origin_otensors_.size(); i++) { - int8_t* output_data = origin_otensors_[i]->mutable_data(); - VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + auto converted_data = origin_otensors_[i]->mutable_data(); + auto origin_data = reinterpret_cast(converted_data); for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { - output_data[j] -= (int8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) - 128); } } NeuronExecution_free(run); VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; - return 0; + return true; } SubgraphEngine::~SubgraphEngine() { @@ -207,18 +191,17 @@ SubgraphEngine::~SubgraphEngine() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace apu diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h index ecd8a38343cd1f62bb5a3bf8e948384b90cfe826..de15abdf7fdbce8001676a2bf7f651ad1e435c74 100644 --- a/lite/kernels/apu/subgraph_compute.h +++ b/lite/kernels/apu/subgraph_compute.h @@ -31,18 +31,22 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} ~SubgraphEngine(); protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; NeuronModel *model_; NeuronCompilation *compilation_; diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 687d91202fd62ed09a5157abe90bb59eb56303b5..f4fe6ba1ebb9a7e775f0d5db1031f9fd40508c20 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -54,7 +54,7 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) - +add_kernel(group_norm_compute ARM extra SRCS group_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) ## 3. 
extra kernels add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -75,9 +75,9 @@ add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_comp add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm) # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -86,7 +86,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -102,7 +101,6 @@ add_kernel(deformable_conv_compute_arm ARM extra SRCS deformable_conv_compute.cc add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index 085e914c6e05c26d3031a4cfdac3c39d31f40f6d..5f3174edbbb53381db29bfa6b99f62a9e7094a4d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -217,6 +217,17 @@ void AbsCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void ThresholdedReluCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.relu_threshold; + lite::arm::math::act_thresholded_relu( + x_data, output_data, x_dims.production(), 
threshold, ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -336,3 +347,12 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL(thresholded_relu, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ThresholdedReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 2e9774637b7a9156197ffeff5f4bca13a20620bb..a915937590ee8748ac419c5b33f82c81d8480852 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -175,6 +175,16 @@ class AbsCompute : public KernelLite { virtual ~AbsCompute() = default; }; +class ThresholdedReluCompute + : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ThresholdedReluCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc deleted file mode 100644 index 137668fa5e0d1bd07e838b3040a31e084a7475c8..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/activation_grad_compute.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/activation_grad_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void SquareGradCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto out_grad_dims = param.Out_grad->dims(); - auto out_grad_data = param.Out_grad->data(); - - auto x_data = param.X->data(); - auto x_grad_data = param.X_grad->mutable_data(); - lite::arm::math::act_square_grad(x_data, - out_grad_data, - x_grad_data, - out_grad_dims.production(), - ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(square_grad, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SquareGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 034d57cdaba77130b319d203c3ae0616720c9d31..5e511264a855ac86a9fb12ede56d51fb1ea83010 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
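// Illustrative reference, not part of the patch: the thresholded_relu kernel
// registered above forwards param.relu_threshold to
// lite::arm::math::act_thresholded_relu. Assuming the conventional definition
// of the activation (pass values above the threshold, zero everything else),
// a scalar reference would look like the hypothetical sketch below.
#include <cstdint>

static void thresholded_relu_ref(const float* x,
                                 float* out,
                                 int64_t size,
                                 float threshold) {
  for (int64_t i = 0; i < size; ++i) {
    // keep x[i] only when it exceeds the threshold, otherwise output 0
    out[i] = x[i] > threshold ? x[i] : 0.0f;
  }
}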
-#include "lite/kernels/arm/argmax_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/argmax_compute.h" namespace paddle { namespace lite { @@ -66,9 +68,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { } TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); + auto argmax = KernelRegistry::Global().Create("arg_max"); ASSERT_FALSE(argmax.empty()); ASSERT_TRUE(argmax.front()); } diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc index af145435ebe2c5bd0c1d1b78b112e8a8572d36ec..7348630e776155cd421bc78a9da7494d42e84c3f 100644 --- a/lite/kernels/arm/axpy_compute_test.cc +++ b/lite/kernels/arm/axpy_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/axpy_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/axpy_compute.h" namespace paddle { namespace lite { @@ -61,8 +63,7 @@ void axpy_compute_ref(const operators::AxpyParam& param) { } TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); + auto axpy = KernelRegistry::Global().Create("axpy"); ASSERT_FALSE(axpy.empty()); ASSERT_TRUE(axpy.front()); } diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc index bf690f88a5e776709a3988cc843762db3bf684e6..a3ef9bda4a17ebfdb5468c911cc6c9aa6a5d4fd7 100644 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ b/lite/kernels/arm/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/batch_norm_compute.h" namespace paddle { namespace lite { @@ -78,9 +80,7 @@ void batch_norm_compute_ref(const operators::BatchNormParam& param) { } TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 6dac97dcbc59991d4680ab1a98a54a900573f631..383e868843b43f4081e1eac330b1422b79307d9c 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8::Run() { din, dout, scale.data(), 1, 1, param.input->numel()); } +template +void CalibComputeInt64ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + std::vector scale = {param.scale}; + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = din[i]; + } +} + template void CalibComputeInt8ToFp32::Run() { auto& param = this->template Param(); @@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL( calib_once, kARM, @@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index a4c8b4c1232101416e95171d70ab629f6a37177b..f10bb931df9b276bc3bb01da16906f3e5b5a7dce 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8 private: }; +template +class CalibComputeInt64ToInt32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToInt32() override{}; + + private: +}; + template class CalibComputeInt8ToFp32 : public KernelLite { diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a..919e9c603edff4383f086ac795c3dff4ed856c4f 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,8 +62,19 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 0 && param.out_dtype == 5) { // bool->fp32 + const bool* x_data_begin = param.X->data(); + const bool* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); + } else if 
(param.in_dtype == 3 && param.out_dtype == 5) { // int64->fp32 + const int64_t* x_data_begin = param.X->data(); + const int64_t* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); } else { - LOG(FATAL) << "other has not been implemented"; + LOG(FATAL) << "other has not been implemented transform with dtype" + << param.in_dtype << " X, dtype" << param.out_dtype << " Out"; } } diff --git a/lite/kernels/arm/clip_compute.cc b/lite/kernels/arm/clip_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d71eaef9e5b3e68d571a48e1a9772b8870c29b7 --- /dev/null +++ b/lite/kernels/arm/clip_compute.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/clip_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ClipCompute::Run() { + auto& param = Param(); + lite::Tensor* x = param.x; + lite::Tensor* min_tensor = param.min_tensor; + lite::Tensor* max_tensor = param.max_tensor; + lite::Tensor* out = param.out; + float min = param.min; + float max = param.max; + + if (min_tensor != nullptr) { + min = min_tensor->data()[0]; + } + if (max_tensor != nullptr) { + max = max_tensor->data()[0]; + } + + const float* x_ptr = x->data(); + float* out_ptr = out->mutable_data(); + int64_t num = x->numel(); + lite::arm::math::clip_kernel_fp32(x_ptr, num, min, max, out_ptr); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + clip, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ClipCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Min", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Max", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/clip_compute.h b/lite/kernels/arm/clip_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..94c2b3a32ea2fc0847d8e223ecd61856fa8e3ed2 --- /dev/null +++ b/lite/kernels/arm/clip_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/clip_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ClipCompute : public KernelLite { + public: + using param_t = operators::ClipParam; + + void Run() override; + + virtual ~ClipCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955c29b261b2103479ea00bb836c0a31f..9ab4ca54bb909876bc823ac25cb67764eab12e47 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -52,11 +52,7 @@ void ConcatFunc(const std::vector inputs, output_offset += in_stride[0]; } } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - lite::arm::math::concat_func(inputs_concat, axis, out); + lite::arm::math::concat_func(inputs, axis, out); } } @@ -71,6 +67,9 @@ void ConcatCompute::Run() { auto* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } + if (axis < 0) { + axis += inputs[0]->dims().size(); + } switch (inputs.front()->precision()) { case PRECISION(kFloat): diff --git a/lite/kernels/arm/concat_compute_test.cc b/lite/kernels/arm/concat_compute_test.cc index 44c6dedd44ad4509a3f5a9c13fc04d6f1ffbdc64..862094fd23aa339bba0b06c4200e71f06402c645 100644 --- a/lite/kernels/arm/concat_compute_test.cc +++ b/lite/kernels/arm/concat_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/concat_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/concat_compute.h" namespace paddle { namespace lite { @@ -221,8 +223,7 @@ TEST(concat_arm, compute_input_multi) { } TEST(concat, retrive_op) { - auto concat = - KernelRegistry::Global().Create("concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h deleted file mode 100644 index 91eadff931ec8aa54092347bcf18f8428130ef75..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/conditional_block_compute.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/operators/conditional_block_op.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#include "lite/core/profile/precision_profiler.h" -#include "lite/core/profile/profiler.h" -#endif - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class CondExecutor { - typedef std::shared_ptr OpPtr; - - public: - CondExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - lite::profile::Profiler profiler; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - for (auto &op_handler : ops_of_block_) { - op_handler->CheckShape(); - op_handler->InferShape(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - std::unique_ptr kernel(op_handler->GetKernel()); - Instruction inst(op_handler, std::move(kernel)); - inst.set_profiler(&profiler); -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - op_handler->Run(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class ConditionalBlockCompute - : public KernelLite { - public: - using param_t = operators::ConditionalBlockParam; - - void PrepareForRun() override; - void Run() override; - - virtual ~ConditionalBlockCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 2a545e70691f030a3a1e3f2a9a9822f5cd8b85b9..54e67de5abbfc88f64a50b07335d2527d9738206 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -73,7 +73,6 @@ void ConvCompute::PrepareForRun() { // VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal && no_dilation) { - // TODO(MyPandaShaoxiang): winograd conv support any pad impl_ = new WinogradConv; // VLOG(3) << "invoking winograd conv"; } else if (param.groups == 1 && kw == 3 && stride == 2 && @@ -122,10 +121,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; - } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - ic * oc < 4 * hin * win && kps_equal && no_dilation) { + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && + pads_equal) { + impl_ = new WinogradConv; + // VLOG(3) << 
"Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; @@ -169,10 +172,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; - } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - ic * oc < 4 * hin * win && kps_equal && no_dilation) { + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && + pads_equal) { + impl_ = new WinogradConv; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index af428fd9c03a34f2d181958815a927da62982e9d..f61c6109cdfd57b30c2b57390d21dec7c3bb3aa2 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "lite/kernels/arm/conv_winograd.h" -#include #include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/packed_sgemm.h" @@ -183,6 +182,189 @@ void WinogradConv::Run() { } } +template +void WinogradConv::ReInitWhenNeeded() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + int threads = ctx.threads(); + + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + if (last_shape_ == x_dims) { + return; + } + last_shape_ = x_dims; + //! update workspace size + int ic = x_dims[1]; + int ih = x_dims[2]; + int iw = x_dims[3]; + int oc = o_dims[1]; + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; + auto pad = *(param.paddings); + int pad_h0 = pad[0]; + int pad_h1 = pad[1]; + int pad_w0 = pad[2]; + int pad_w1 = pad[3]; + int oc_pad = (oc + 7) / 8 * 8; + int ic_pad = (ic + 7) / 8 * 8; + const int new_input_size = + ic_pad * (ih + pad_h0 + pad_h1) * (iw + pad_w0 + pad_w1) + + oc_pad * oh * ow * sizeof(int32_t); + int tmp_input_thread_size_byte = + tile_block * ic_pad * wino_iw * wino_iw * sizeof(int16_t); + int tmp_output_thread_size_byte = + tile_block * oc_pad * wino_iw * wino_iw * sizeof(int32_t); + const int temp_size = + (tmp_input_thread_size_byte + tmp_output_thread_size_byte + + wino_iw * wino_iw * (8 + 8 * sizeof(int32_t))) * + threads; + workspace_size_ = temp_size + new_input_size; + + //! update trans weights impl + // choose_small_ = ow * oh / (tile_block * threads) < 36 ? 
true : false; + // we only support 2x2 now + choose_small_ = true; + float w_fact = 0.25; + if (choose_small_) { + wino_iw = 4; + + if (last_function_ == 0) { + return; + } + last_function_ = 0; + } else { + wino_iw = 6; + if (last_function_ == 1) { + return; + } + last_function_ = 1; + } + /// update scale + for (auto& ws : w_scale_) { + ws *= w_fact; + } + + weights_.Resize({1, 1, 1, wino_iw * wino_iw * oc_pad * ic_pad}); + void* trans_tmp_ptr = malloc(sizeof(int16_t) * wino_iw * wino_iw * oc * ic); + auto weights_data_ = weights_.mutable_data(); + if (!choose_small_) { + } else { + lite::arm::math::weight_trans_c8_4x4_int8( + weights_data_, + param.filter->template data(), + ic, + oc, + trans_tmp_ptr); + } + free(trans_tmp_ptr); +} + +template +void WinogradConv::PrepareForRun() { + auto& param = this->Param(); + w_scale_ = param.weight_scale; + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + return; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0] - 1; ++i) { + w_scale_.push_back(w_scale_[0]); + } + } + float input_scale = param.input_scale; + for (auto& ws : w_scale_) { + ws *= input_scale; + } + if (param.bias) { + bias_.Resize(param.bias->dims()); + auto ptr = bias_.mutable_data(); + auto ptr_in = param.bias->template data(); + for (int i = 0; i < bias_.numel(); ++i) { + ptr[i] = ptr_in[i]; + } + } + if (OutType == PRECISION(kInt8)) { + float output_scale = param.output_scale; + for (auto& ws : w_scale_) { + ws /= output_scale; + } + if (param.bias) { + auto ptr = bias_.mutable_data(); + for (int i = 0; i < bias_.numel(); ++i) { + ptr[i] /= output_scale; + } + } + } + ReInitWhenNeeded(); +} + +template +void WinogradConv::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + ctx.ExtendWorkspace(workspace_size_); + const auto* i_data = param.x->template data(); + const auto* w_data = weights_.data(); + const auto* b_data = param.bias ? 
bias_.data() : nullptr; + // const float* i_data; + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + int iw = x_dims[3]; // nchw + int ih = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int oh = o_dims[2]; + int ow = o_dims[3]; + int oc = o_dims[1]; + + // now always choose small + if (OutType == PRECISION(kInt8)) { + auto* o_data = param.output->template mutable_data(); + lite::arm::math::conv_compute_2x2_3x3_int8(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + w_scale_.data(), + param, + &ctx); + } else { + auto* o_data = param.output->template mutable_data(); + lite::arm::math::conv_compute_2x2_3x3_int8(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + w_scale_.data(), + param, + &ctx); + } +#ifdef LITE_WITH_PROFILE + kernel_func_name_ = "conv_compute_2x2_3x3_int8"; +#endif +} +template class WinogradConv; +template class WinogradConv; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 1cb4d69acbc562b7bb7d50944daf6c6ff3b5d790..b93a719f7dbb13aa9888ea943fa81b6ea2b38c00 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -16,11 +16,11 @@ #include #include +#include #include "lite/backends/arm/math/conv_impl.h" #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/target_wrapper.h" - namespace paddle { namespace lite { namespace kernels { @@ -52,7 +52,34 @@ class WinogradConv : public KernelLite { bool choose_small_{false}; int wino_iw{8}; }; +template +class WinogradConv + : public KernelLite { + public: + WinogradConv() = default; + ~WinogradConv() {} + virtual void PrepareForRun(); + virtual void ReInitWhenNeeded(); + virtual void Run(); +#ifdef LITE_WITH_PROFILE + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + std::string kernel_func_name_{"NotImplForConvWino"}; +#endif + protected: + using param_t = operators::ConvParam; + Tensor weights_; + Tensor bias_; + DDim last_shape_; + int workspace_size_{0}; + int last_function_{-1}; + bool choose_small_{true}; + int wino_iw{4}; + std::vector w_scale_; +}; } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/decode_bboxes_compute_test.cc b/lite/kernels/arm/decode_bboxes_compute_test.cc index 271a99c29b61063877b7d1c0d2e50bc65d135d72..ef9da0f1e2c53a021c82f19d3151a2fe8fba8af4 100644 --- a/lite/kernels/arm/decode_bboxes_compute_test.cc +++ b/lite/kernels/arm/decode_bboxes_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
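// Illustrative sketch, not part of the patch: the int8 Winograd PrepareForRun
// above folds the input scale (and, for int8 outputs, the output scale) into
// the per-output-channel weight scales, so the int32 accumulator can be
// rescaled with one multiplier per channel at runtime. The helper name below
// is hypothetical.
#include <vector>

static std::vector<float> FoldConvScales(std::vector<float> weight_scale,
                                         float input_scale,
                                         float output_scale,
                                         bool int8_output) {
  for (auto& ws : weight_scale) {
    ws *= input_scale;     // acc_int32 * ws ~ output in float units
    if (int8_output) {
      ws /= output_scale;  // acc_int32 * ws ~ output in int8 units
    }
  }
  // The bias is rescaled the same way for int8 outputs (bias / output_scale).
  return weight_scale;
}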
-#include "lite/kernels/arm/decode_bboxes_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/decode_bboxes_compute.h" namespace paddle { namespace lite { @@ -115,9 +117,7 @@ void decode_bboxes_compute_ref(const operators::DecodeBboxesParam& param) { } TEST(decode_bboxes_arm, retrive_op) { - auto decode_bboxes = - KernelRegistry::Global().Create( - "decode_bboxes"); + auto decode_bboxes = KernelRegistry::Global().Create("decode_bboxes"); ASSERT_FALSE(decode_bboxes.empty()); ASSERT_TRUE(decode_bboxes.front()); } diff --git a/lite/kernels/arm/deformable_conv_compute.cc b/lite/kernels/arm/deformable_conv_compute.cc index 6253b661d05535d7b3b4a2ee18de7707e80b2877..dfdd27799bc1df7f403f40cb50b48aebbfb8d67a 100644 --- a/lite/kernels/arm/deformable_conv_compute.cc +++ b/lite/kernels/arm/deformable_conv_compute.cc @@ -235,7 +235,8 @@ typedef paddle::lite::kernels::arm::DeformableConvCompute DeformableConvFp32; -REGISTER_LITE_KERNEL(deformconv2d, kARM, kFloat, kNCHW, DeformableConvFp32, def) +REGISTER_LITE_KERNEL( + deformable_conv, kARM, kFloat, kNCHW, DeformableConvFp32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) diff --git a/lite/kernels/arm/deformable_conv_compute.h b/lite/kernels/arm/deformable_conv_compute.h index 6c8995ddd447a4382ee40e00f3b31832566ad9e9..17fae957619b7754637023a21169da9641686e59 100644 --- a/lite/kernels/arm/deformable_conv_compute.h +++ b/lite/kernels/arm/deformable_conv_compute.h @@ -17,6 +17,7 @@ #include "lite/backends/arm/math/funcs.h" #include "lite/core/kernel.h" #ifdef LITE_WITH_PROFILE +#include #include "lite/core/profile/profiler.h" #endif @@ -56,8 +57,9 @@ class DeformableConvCompute : public KernelLite { #ifdef LITE_WITH_PROFILE virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { - impl_->SetProfileRuntimeKernelInfo(ch); + ch->kernel_func_name = kernel_func_name_; } + std::string kernel_func_name_{"NotImplForDeformableConv"}; #endif ~DeformableConvCompute() = default; diff --git a/lite/kernels/arm/dropout_compute_test.cc b/lite/kernels/arm/dropout_compute_test.cc index 1c0f8db347304076caee23ee3d295bcfacbe2a1f..0aa16b8d348d7b8415120051df0e9732fada4495 100644 --- a/lite/kernels/arm/dropout_compute_test.cc +++ b/lite/kernels/arm/dropout_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/dropout_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/dropout_compute.h" namespace paddle { namespace lite { @@ -30,9 +32,7 @@ TEST(dropout_arm, init) { } TEST(dropout, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/arm/elementwise_compute.cc b/lite/kernels/arm/elementwise_compute.cc index 8115700f5950ddfcb71df49e6a21528563f23d95..3e898d9ded2153588c164d2ccd618fc77f7c3854 100644 --- a/lite/kernels/arm/elementwise_compute.cc +++ b/lite/kernels/arm/elementwise_compute.cc @@ -202,17 +202,13 @@ void ElementwiseMulCompute::Run() { } } -template <> -void ElementwiseMulCompute::Run() { - auto& param = this->template Param(); - lite::arm::math::elementwise_compute_basic(param, "mul", ""); -} - -void ElementwiseMulActivationCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseMulActivationCompute::Run() { + auto& param = + this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; std::string act_type = param.act_type; auto x_dims = param.X->dims(); @@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() { if (x_dims.size() < y_dims.size() && is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( y_data, x_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( x_data, y_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu( + lite::arm::math::elementwise_mul_relu( x_data, y_data, out_data, x_dims.production()); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; @@ -300,11 +296,12 @@ void ElementwiseMaxActivationCompute::Run() { } } -void ElementwiseDivCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseDivCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; auto x_dims = param.X->dims(); auto y_dims = param.Y->dims(); @@ -313,10 +310,10 @@ void ElementwiseDivCompute::Run() { LOG(FATAL) << "elewise div don't support x_dims size < y_dims size"; } if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { - lite::arm::math::elementwise_div_broadcast( + lite::arm::math::elementwise_div_broadcast( x_data, y_data, out_data, pre, n, post); } else { - lite::arm::math::elementwise_div( + lite::arm::math::elementwise_div( x_data, y_data, out_data, x_dims.production()); } } @@ -351,6 +348,29 @@ void ElementwiseDivActivationCompute::Run() { } } +template +void 
ElementwiseModCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + y_data, x_data, out_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + x_data, y_data, out_data, pre, n, post); + } else { + lite::arm::math::elementwise_mod( + x_data, y_data, out_data, x_dims.production()); + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -402,46 +422,60 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_float = +using elementwise_mul_float_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def) + elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_int32 = +using elementwise_mul_int32_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def) + elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); -using elementwise_mul_int64 = +using elementwise_mul_int64_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def) + elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - fusion_elementwise_mul_activation, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseMulActivationCompute, - def) +using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kFloat, + kNCHW, + fusion_elementwise_mul_activation_float_t, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kInt64, + kNCHW, + fusion_elementwise_mul_activation_int64_t, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + 
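// Illustrative reference, not part of the patch: the new ElementwiseModCompute
// above reuses the pre/n/post broadcast decomposition shared by the other
// elementwise kernels: the larger tensor is viewed as [pre, n, post] and the
// smaller one as [n], broadcast over the pre and post axes. A scalar reference
// of the broadcast int64 mod path (hypothetical name):
#include <cstdint>

static void elementwise_mod_broadcast_ref(const int64_t* x,
                                          const int64_t* y,
                                          int64_t* out,
                                          int pre,
                                          int n,
                                          int post) {
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int k = 0; k < post; ++k) {
        int idx = (i * n + j) * post + k;
        out[idx] = x[idx] % y[j];  // plain C++ %, as in the integer test reference
      }
    }
  }
}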
REGISTER_LITE_KERNEL(elementwise_max, kARM, kFloat, @@ -465,17 +499,27 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -REGISTER_LITE_KERNEL(elementwise_div, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseDivCompute, - def) +using elementwise_div_fp32_t = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using elementwise_div_int64_t = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + REGISTER_LITE_KERNEL( fusion_elementwise_div_activation, kARM, @@ -487,3 +531,13 @@ REGISTER_LITE_KERNEL( .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +using elementwise_mod_int64_t = + paddle::lite::kernels::arm::ElementwiseModCompute; +REGISTER_LITE_KERNEL( + elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64_t, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_compute.h b/lite/kernels/arm/elementwise_compute.h index 731010a0d189c08f031363e6df95652c000a237b..89d9898648d25fec98568f2456fe96903da0a69d 100644 --- a/lite/kernels/arm/elementwise_compute.h +++ b/lite/kernels/arm/elementwise_compute.h @@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite { virtual ~ElementwiseMulCompute() = default; }; -class ElementwiseMulActivationCompute - : public KernelLite { +template +class ElementwiseMulActivationCompute : public KernelLite { public: void Run() override; @@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute virtual ~ElementwiseMaxActivationCompute() = default; }; -class ElementwiseDivCompute - : public KernelLite { +template +class ElementwiseDivCompute : public KernelLite { public: void Run() override; @@ -102,6 +102,22 @@ class ElementwiseDivActivationCompute virtual ~ElementwiseDivActivationCompute() = default; }; +template +class ElementwiseModCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModCompute() = default; +}; + +// class ElementwiseModActivationCompute +// : public KernelLite { +// public: +// void Run() override; + +// virtual ~ElementwiseModActivationCompute() = default; +// }; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/elementwise_compute_test.cc b/lite/kernels/arm/elementwise_compute_test.cc index b0ac3a7d33d92239c83147a3fe7615cd2fbf0249..79262fb4ef75283eba12efa0a4ad8dc048681338 100644 --- a/lite/kernels/arm/elementwise_compute_test.cc +++ b/lite/kernels/arm/elementwise_compute_test.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/elementwise_compute.h" #include + +#include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" namespace paddle { namespace lite { @@ -24,9 +27,7 @@ namespace kernels { namespace arm { TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } @@ -140,6 +141,119 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param, } } +template +void elementwise_fmod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = fmod(diny_data + fmod(*din_ptr, diny_data), diny_data); + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template +void elementwise_imod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (*din_ptr) % diny_data; + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template void elementwise_fmod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); + TEST(elementwise_add, compute) { ElementwiseAddCompute elementwise_add; operators::ElementwiseParam param; @@ -222,8 +336,7 @@ TEST(elementwise_add, compute) { TEST(fusion_elementwise_add_activation_arm, retrive_op) { auto fusion_elementwise_add_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_add_activation"); + KernelRegistry::Global().Create("fusion_elementwise_add_activation"); ASSERT_FALSE(fusion_elementwise_add_activation.empty()); ASSERT_TRUE(fusion_elementwise_add_activation.front()); } @@ -321,9 +434,7 @@ TEST(fusion_elementwise_add_activation_arm, compute) { } TEST(elementwise_mul_arm, retrive_op) { - auto elementwise_mul = - KernelRegistry::Global().Create( - "elementwise_mul"); + auto elementwise_mul = KernelRegistry::Global().Create("elementwise_mul"); ASSERT_FALSE(elementwise_mul.empty()); ASSERT_TRUE(elementwise_mul.front()); } @@ -416,20 +527,21 @@ TEST(elementwise_mul, compute) { TEST(fusion_elementwise_mul_activation_arm, retrive_op) { auto fusion_elementwise_mul_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_mul_activation"); + KernelRegistry::Global().Create("fusion_elementwise_mul_activation"); ASSERT_FALSE(fusion_elementwise_mul_activation.empty()); ASSERT_TRUE(fusion_elementwise_mul_activation.front()); } TEST(fusion_elementwise_mul_activation_arm, init) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat)); ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM)); } TEST(fusion_elementwise_mul_activation_arm, compute) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; operators::FusionElementwiseActivationParam param; lite::Tensor x, y, output, output_ref; @@ -515,9 +627,7 @@ TEST(fusion_elementwise_mul_activation_arm, compute) { } TEST(elementwise_max_arm, retrive_op) { - auto elementwise_max = - KernelRegistry::Global().Create( - "elementwise_max"); + auto elementwise_max = KernelRegistry::Global().Create("elementwise_max"); ASSERT_FALSE(elementwise_max.empty()); ASSERT_TRUE(elementwise_max.front()); } @@ -610,8 +720,7 @@ TEST(elementwise_max, compute) { TEST(fusion_elementwise_max_activation_arm, retrive_op) { auto fusion_elementwise_max_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_max_activation"); + KernelRegistry::Global().Create("fusion_elementwise_max_activation"); ASSERT_FALSE(fusion_elementwise_max_activation.empty()); ASSERT_TRUE(fusion_elementwise_max_activation.front()); } @@ -685,7 +794,7 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } for (int i = 0; i < y_dim.production(); i++) { float sign = i % 2 == 0 ? 
0.5f : -0.5f; - y_data[i] = i * sign; + y_data[i] = (i + 1) * sign; } param.X = &x; param.Y = &y; @@ -708,6 +817,106 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } } +TEST(elementwise_mod_int64_arm, retrive_op) { + auto elementwise_mod = KernelRegistry::Global().Create("elementwise_mod"); + ASSERT_FALSE(elementwise_mod.empty()); + ASSERT_TRUE(elementwise_mod.front()); +} + +TEST(elementwise_mod_int64_arm, init) { + ElementwiseModCompute elementwise_mod; + ASSERT_EQ(elementwise_mod.precision(), PRECISION(kInt64)); + ASSERT_EQ(elementwise_mod.target(), TARGET(kARM)); +} + +TEST(elementwise_mod_int64_arm, compute) { + ElementwiseModCompute elementwise_mod; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#else + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i + 1; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = y_dim.production() - i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_mod.SetParam(param); + elementwise_mod.Run(); + param.Out = &output_ref; + elementwise_imod_compute_ref(param, ""); + for (int i = 0; i < output.dims().production(); i++) { + if (std::abs(output_data[i] - output_ref_data[i]) > 1e-5 || + std::isnan(output_data[i]) || + std::isnan(output_ref_data[i])) { + LOG(FATAL) << "elementwise mod cmp error, i: " << i + << ", x_data: " << x_data[i] + << ", y_data: " << y_data[i] + << ", output_data: " << output_data[i] + << ", output_ref_data: " << output_ref_data[i]; + } + } + } + } + } + } + } + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -719,3 +928,4 @@ USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def); diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 0ff1cd6b0dc26cdb2b45b00e34baced1bc5fa131..6e3a620a4a8989807481cb0f56ac91643eda4ce7 100644 --- 
a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -88,7 +88,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = flag_gemm_ ? param.w->data() : weights_.data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -149,8 +149,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -208,8 +207,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); diff --git a/lite/kernels/arm/fc_compute.h b/lite/kernels/arm/fc_compute.h index 4f8a82a8689c1f221ee146176ff7074602cad1c9..e45758775d99112afa0a7e3a45e1c15a9ea371aa 100644 --- a/lite/kernels/arm/fc_compute.h +++ b/lite/kernels/arm/fc_compute.h @@ -104,9 +104,11 @@ class FcCompute : public KernelLite { CHECK_EQ(k_, static_cast(w_dims[0])); flag_gemm_ = check_fc_use_gemm( m_, param.weight_scale, param.bias != nullptr); - if (!flag_trans_weights_ && !flag_gemm_) { - flag_trans_weights_ = true; - fc_trans_weights(*param.w, &weights_); + if (flag_trans_weights_ == flag_gemm_) { + flag_trans_weights_ = !flag_trans_weights_; + Tensor tmp_tensor; + fc_trans_weights(*param.w, &tmp_tensor); + param.w->CopyDataFrom(tmp_tensor); } } @@ -117,7 +119,6 @@ class FcCompute : public KernelLite { private: DDim last_shape_; - Tensor weights_; Tensor bias_; bool flag_trans_weights_{false}; bool flag_trans_bias_{false}; diff --git a/lite/kernels/arm/gather_compute.cc b/lite/kernels/arm/gather_compute.cc index 3efacc4aacefcb150d53738c950ec9e797ed78c7..f5a87e5431955252e47143252ce13ba4056c4a7f 100644 --- a/lite/kernels/arm/gather_compute.cc +++ b/lite/kernels/arm/gather_compute.cc @@ -20,44 +20,45 @@ namespace lite { namespace kernels { namespace arm { -template +template void GatherFunc(const operators::GatherParam& param) { auto src_dims = param.X->dims(); auto index_size = param.Index->dims()[0]; - auto* p_src = param.X->data(); - const int* p_index = param.Index->data(); - auto* p_output = param.Out->mutable_data(); + auto* p_src = param.X->data(); + const IndexType* p_index = param.Index->data(); + auto* p_output = param.Out->mutable_data(); int slice_size = 1; for (size_t i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; } for (int i = 0; i < index_size; ++i) { - int index_ = p_index[i]; + IndexType index_ = p_index[i]; memcpy(p_output + i * slice_size, p_src + index_ * slice_size, - slice_size * sizeof(T)); + slice_size * sizeof(DataType)); } } -void GatherCompute::Run() { - auto& param = this->Param(); +template +void GatherCompute::Run() { + auto& param = this->template Param(); switch (param.X->precision()) { case PRECISION(kFloat): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt8): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt16): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt32): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt64): - 
GatherFunc(param); + GatherFunc(param); break; default: LOG(FATAL) << "Gather does not implement for the " @@ -70,10 +71,26 @@ void GatherCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def_int64_idx) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/arm/gather_compute.h b/lite/kernels/arm/gather_compute.h index 9753f42972407b250886afa6bada8861a642e189..0226e5f68eee3f23dbd945af6f4f455ab79190c5 100644 --- a/lite/kernels/arm/gather_compute.h +++ b/lite/kernels/arm/gather_compute.h @@ -23,6 +23,7 @@ namespace lite { namespace kernels { namespace arm { +template class GatherCompute : public KernelLite { public: void Run() override; diff --git a/lite/kernels/arm/group_norm_compute.cc b/lite/kernels/arm/group_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e370414f4079f8dbbc2e5cc9af294c7b3f88718 --- /dev/null +++ b/lite/kernels/arm/group_norm_compute.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
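The gather rework above makes both the element type and the index type template parameters, so one kernel body serves float/int8/int16/int32/int64 data with either an int32 or an int64 Index tensor (the second registration, def_int64_idx, binds the int64 case). As a rough standalone sketch of the copy it performs, with simplified names rather than the real Lite API:

#include <cstdint>
#include <cstring>

// Illustrative only: out[i, :] = src[index[i], :] for each gathered row,
// where slice_size is the product of all dimensions after axis 0.
template <typename DataType, typename IndexType>
void gather_rows(const DataType* src,
                 const IndexType* index,
                 DataType* out,
                 int index_size,
                 int slice_size) {
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(out + i * slice_size,
                src + static_cast<int64_t>(index[i]) * slice_size,
                slice_size * sizeof(DataType));
  }
}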
+ +#include "lite/kernels/arm/group_norm_compute.h" +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void GroupNormCompute::PrepareForRun() {} + +void GroupNormCompute::Run() { + auto& param = this->Param(); + const float* in = param.x->data(); + const float* scale = param.scale->data(); + const float* bias = param.bias->data(); + float* out = param.out->mutable_data(); + float* saved_mean = param.saved_mean->mutable_data(); + float* saved_variance = param.saved_variance->mutable_data(); + float epsilon = param.epsilon; + int groups = param.groups; + int channels = param.channels; + int n = param.x->dims()[0]; + int c = param.x->dims()[1]; + int ch_per_group = channels / groups; + int height = param.x->dims()[2]; + int width = param.x->dims()[3]; + int spatial_size = ch_per_group * height * width; + int ngroup = n * groups; + int cnt = spatial_size >> 4; + int remain = spatial_size % 16; +// compute saved_mean and saved_variance +#pragma omp parallel for + for (int n = 0; n < ngroup; ++n) { + const float* in_p = in + n * spatial_size; + float sum_spatial = 0.f; + float summ_spatial = 0.f; + float32x4_t sum0 = vdupq_n_f32(0.f); + float32x4_t sum1 = vdupq_n_f32(0.f); + float32x4_t sum2 = vdupq_n_f32(0.f); + float32x4_t sum3 = vdupq_n_f32(0.f); + float32x4_t summ0 = vdupq_n_f32(0.f); + float32x4_t summ1 = vdupq_n_f32(0.f); + float32x4_t summ2 = vdupq_n_f32(0.f); + float32x4_t summ3 = vdupq_n_f32(0.f); + for (int i = 0; i < cnt; i++) { + float32x4_t in0 = vld1q_f32(in_p); + float32x4_t in1 = vld1q_f32(in_p + 4); + float32x4_t in2 = vld1q_f32(in_p + 8); + float32x4_t in3 = vld1q_f32(in_p + 12); + sum0 = vaddq_f32(sum0, in0); + summ0 = vmlaq_f32(summ0, in0, in0); + sum1 = vaddq_f32(sum1, in1); + summ1 = vmlaq_f32(summ1, in1, in1); + sum2 = vaddq_f32(sum2, in2); + summ2 = vmlaq_f32(summ2, in2, in2); + sum3 = vaddq_f32(sum3, in3); + summ3 = vmlaq_f32(summ3, in3, in3); + in_p += 16; + } + for (int i = 0; i < remain - 3; i += 4) { + float32x4_t in0 = vld1q_f32(in_p); + sum1 = vaddq_f32(sum1, in0); + summ1 = vmlaq_f32(summ1, in0, in0); + in_p += 4; + } + float sum = 0.0; + float summ = 0.0; + sum0 = vaddq_f32(sum0, sum1); + sum2 = vaddq_f32(sum2, sum3); + summ0 = vaddq_f32(summ0, summ1); + summ2 = vaddq_f32(summ2, summ3); + for (int i = 0; i < remain % 4; i++) { + sum += *in_p; + summ += (*in_p) * (*in_p); + in_p++; + } + sum0 = vaddq_f32(sum0, sum2); + summ0 = vaddq_f32(summ0, summ2); + float32x2_t sum_low = vpadd_f32(vget_low_f32(sum0), vget_high_f32(sum0)); + float32x2_t sum_high = vpadd_f32(vget_low_f32(summ0), vget_high_f32(summ0)); + float32x2_t sum_mix = vpadd_f32(sum_low, sum_high); + sum += vget_lane_f32(sum_mix, 0); + summ += vget_lane_f32(sum_mix, 1); + float mean = sum / spatial_size; + // float variance = summ / spatial_size - mean * mean; + // the flolowing code has higher precision than above comment code + float variance = (summ - mean * mean * spatial_size) / spatial_size; + float std = 1.f / sqrtf(variance + epsilon); + saved_mean[n] = mean; + saved_variance[n] = std; + } + int in_size = height * width; + cnt = in_size >> 4; + remain = in_size % 16; +// compute Group_norm result: out = scale * (in - mean) / std + bias +#pragma omp parallel for + for (int i = 0; i < ngroup; ++i) { + const float* in_p = in + i * spatial_size; + float* out_p = out + i * spatial_size; + int numc = i % groups; + numc *= ch_per_group; + for (int c = 0; c < 
ch_per_group; c++) { + int chin = numc + c; + const float sstd_val = scale[chin] * saved_variance[i]; + const float bias_val = bias[chin]; + const float mean_val = saved_mean[i]; + const float32x4_t vsstd = vdupq_n_f32(sstd_val); + const float32x4_t vbias = vdupq_n_f32(bias_val); + const float32x4_t vmean = vdupq_n_f32(mean_val); + for (int k = 0; k < cnt; k++) { + float32x4_t in0 = vld1q_f32(in_p); + float32x4_t in1 = vld1q_f32(in_p + 4); + float32x4_t in2 = vld1q_f32(in_p + 8); + float32x4_t in3 = vld1q_f32(in_p + 12); + float32x4_t submean0 = vsubq_f32(in0, vmean); + float32x4_t submean1 = vsubq_f32(in1, vmean); + float32x4_t submean2 = vsubq_f32(in2, vmean); + float32x4_t submean3 = vsubq_f32(in3, vmean); + float32x4_t out0 = vmlaq_f32(vbias, submean0, vsstd); + float32x4_t out1 = vmlaq_f32(vbias, submean1, vsstd); + float32x4_t out2 = vmlaq_f32(vbias, submean2, vsstd); + float32x4_t out3 = vmlaq_f32(vbias, submean3, vsstd); + vst1q_f32(out_p, out0); + vst1q_f32(out_p + 4, out1); + vst1q_f32(out_p + 8, out2); + vst1q_f32(out_p + 12, out3); + in_p += 16; + out_p += 16; + } + for (int k = 0; k < remain - 3; k += 4) { + float32x4_t in0 = vld1q_f32(in_p); + in_p += 4; + float32x4_t submean0 = vsubq_f32(in0, vmean); + float32x4_t out0 = vmlaq_f32(vbias, submean0, vsstd); + vst1q_f32(out_p, out0); + out_p += 4; + } + for (int k = 0; k < remain % 4; k++) { + *out_p = (*in_p - mean_val) * sstd_val + bias_val; + in_p++; + out_p++; + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(group_norm, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::GroupNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/group_norm_compute.h similarity index 81% rename from lite/kernels/arm/activation_grad_compute.h rename to lite/kernels/arm/group_norm_compute.h index ef03f58fa8cd499192aa6edfe3a7c51b49b14f65..7d61b8ec8d9a1c8620c54858487b21691bef84d5 100644 --- a/lite/kernels/arm/activation_grad_compute.h +++ b/lite/kernels/arm/group_norm_compute.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include #include "lite/core/kernel.h" #include "lite/core/op_registry.h" @@ -22,13 +21,17 @@ namespace lite { namespace kernels { namespace arm { -class SquareGradCompute : public KernelLite { +class GroupNormCompute : public KernelLite { public: - using param_t = operators::ActivationGradParam; + using param_t = operators::GroupNormParam; + + void PrepareForRun() override; void Run() override; - virtual ~SquareGradCompute() = default; + virtual ~GroupNormCompute() = default; + + private: }; } // namespace arm diff --git a/lite/kernels/arm/layer_norm_compute_test.cc b/lite/kernels/arm/layer_norm_compute_test.cc index 22fe3d06569fac424ab797712142b4d088dc7d3a..e84f9f133ce0cdecb714dc535c0f5833597105c6 100644 --- a/lite/kernels/arm/layer_norm_compute_test.cc +++ b/lite/kernels/arm/layer_norm_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
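The group_norm kernel above is a NEON implementation of the usual two-pass scheme: for every (batch, group) pair it accumulates the sum and the sum of squares to obtain the mean and 1/sqrt(variance + epsilon), then applies the per-channel affine transform. A plain scalar reference of that math, assuming the same NCHW layout (a sketch, not the kernel itself):

#include <cmath>

// Sketch: x and y are NCHW float buffers; the c channels are split into
// `groups` consecutive groups, each normalized over ch_per_group * h * w
// elements.
void group_norm_ref(const float* x, const float* scale, const float* bias,
                    float* y, int n, int c, int h, int w,
                    int groups, float epsilon) {
  const int ch_per_group = c / groups;
  const int spatial = ch_per_group * h * w;
  for (int ng = 0; ng < n * groups; ++ng) {
    const float* xg = x + ng * spatial;
    float sum = 0.f, sqsum = 0.f;
    for (int i = 0; i < spatial; ++i) {
      sum += xg[i];
      sqsum += xg[i] * xg[i];
    }
    const float mean = sum / spatial;
    const float var = (sqsum - mean * mean * spatial) / spatial;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int ch = 0; ch < ch_per_group; ++ch) {
      const int chin = (ng % groups) * ch_per_group + ch;  // global channel id
      const float* xc = xg + ch * h * w;
      float* yc = y + ng * spatial + ch * h * w;
      for (int i = 0; i < h * w; ++i) {
        yc[i] = (xc[i] - mean) * inv_std * scale[chin] + bias[chin];
      }
    }
  }
}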
-#include "lite/kernels/arm/layer_norm_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/layer_norm_compute.h" namespace paddle { namespace lite { @@ -181,9 +183,7 @@ TEST(layer_norm_arm, compute) { } TEST(layer_norm, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index e7030d00427e55c7faf333997cd90cba46260cd4..9afd05b80aaffdc4be2ae1deaa5993b8fd21dce4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/lrn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/lrn_compute.h" namespace paddle { namespace lite { @@ -133,8 +135,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { } TEST(lrn_arm, retrive_op) { - auto lrn = - KernelRegistry::Global().Create("lrn"); + auto lrn = KernelRegistry::Global().Create("lrn"); ASSERT_FALSE(lrn.empty()); ASSERT_TRUE(lrn.front()); } diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc index 914a58308bdf0d5c6d374d5f81ca38224941c85d..f8d92dfdc740988733ad26d5385b17050b490635 100644 --- a/lite/kernels/arm/merge_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/merge_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/merge_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/merge_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(merge_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "merge_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("merge_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/mul_compute_test.cc b/lite/kernels/arm/mul_compute_test.cc index cddee81fe22897dbe91721ed172b144539e0852c..76ab95b93485b3e6701dca6224ce2a5f7a8b3df7 100644 --- a/lite/kernels/arm/mul_compute_test.cc +++ b/lite/kernels/arm/mul_compute_test.cc @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/mul_compute.h" #include + #include #include #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" namespace paddle { namespace lite { @@ -69,8 +71,7 @@ void FillData(T* a, } TEST(mul_arm, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index acdaf0d0131621c1c2403b8a071d6cb1134f4565..c4aeb20a5bf53d80be4b407698a51ead46f6b8f5 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/pool_compute.h" #include + #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/pool_compute.h" namespace paddle { namespace lite { @@ -341,8 +343,7 @@ TEST(pool_arm, compute) { } TEST(pool_arm, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/arm/scale_compute_test.cc b/lite/kernels/arm/scale_compute_test.cc index 0d327b9807d306770850b09ed1ed2a0337104c92..fe5e1911d0cc2c012876731f50bd04b3125b8fa2 100644 --- a/lite/kernels/arm/scale_compute_test.cc +++ b/lite/kernels/arm/scale_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/scale_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/scale_compute.h" namespace paddle { namespace lite { @@ -103,8 +105,7 @@ TEST(scale_arm, compute) { } TEST(scale, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index a70b6717097ec0ffdaa24ba257bfdf8dbd536f3f..455615e66de53a4a6f235f8ab803394962292936 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/sgemm.h" #include "lite/core/op_registry.h" @@ -88,7 +89,7 @@ void SequenceConvCompute::Run() { paddle::lite::arm::math::im2col( sub_in_data, 1, - sequence_len, + input_row_end - input_row_begin, hidden_dim, // C H W -> 1, seq_len, hidden_dim kernel_size, hidden_dim, // kernel_h, kernel_w @@ -101,10 +102,14 @@ void SequenceConvCompute::Run() { 1, 1, // stride_h, stride_w, dilation_h, dilation_w tmp_data); - local_naive_transpose(tmp_data, - sub_col_data, - kernel_size * hidden_dim, - input_row_end - input_row_begin); + int cols = kernel_size * hidden_dim; + int rows = input_row_end - input_row_begin; + if (cols % 4 == 0 && rows % 4 == 0) { + paddle::lite::arm::math::local_transpose( + tmp_data, sub_col_data, cols, rows); + } else { + local_naive_transpose(tmp_data, sub_col_data, cols, rows); + } } } diff --git a/lite/kernels/arm/softmax_compute.cc b/lite/kernels/arm/softmax_compute.cc index 3409d0f5c5bd6e7ce1ea77809f7715b62bb10ca2..79ea23ab3fad3340c63846ea11cc89b371f5c6c9 100644 --- a/lite/kernels/arm/softmax_compute.cc +++ b/lite/kernels/arm/softmax_compute.cc @@ -34,7 +34,7 @@ void SoftmaxCompute::Run() { int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int axis_size = x_dims[axis]; if (inner_num == 1) { - if (axis_size >= 4) { + if (axis_size > 4) { lite::arm::math::softmax_inner1_large_axis( din, dout, outer_num, axis_size); } else { diff --git a/lite/kernels/arm/softmax_compute_test.cc b/lite/kernels/arm/softmax_compute_test.cc index 459112d8c0169375584baf0cb983037682e47a3d..486ccf2cedd1af3ce0d7cc2f7d0aeecaadf15ca9 100644 --- a/lite/kernels/arm/softmax_compute_test.cc +++ b/lite/kernels/arm/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/arm/split_compute_test.cc b/lite/kernels/arm/split_compute_test.cc index 034fbb85c487df6159a6a22b9958cc9e64d9e1c6..c51ea186b52a77abec5c7560b0a028079bea4aba 100644 --- a/lite/kernels/arm/split_compute_test.cc +++ b/lite/kernels/arm/split_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
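In the sequence_conv change above, the im2col height now uses the rows of the current sequence (input_row_end - input_row_begin) rather than the whole sequence_len, and the resulting buffer is transposed into the GEMM layout through the vectorized local_transpose only when both dimensions are multiples of 4, falling back to the element-by-element path otherwise. A sketch of that fallback, with the argument order and orientation of local_naive_transpose assumed here for illustration:

// Illustrative fallback only; `in` is assumed rows x cols row-major and
// `out` becomes cols x rows row-major.
void naive_transpose_sketch(const float* in, float* out, int cols, int rows) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[c * rows + r] = in[r * cols + c];
    }
  }
}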
-#include "lite/kernels/arm/split_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_compute.h" namespace paddle { namespace lite { @@ -165,8 +167,7 @@ TEST(split_arm, compute) { } TEST(split, retrive_op) { - auto split = - KernelRegistry::Global().Create("split"); + auto split = KernelRegistry::Global().Create("split"); ASSERT_FALSE(split.empty()); ASSERT_TRUE(split.front()); } diff --git a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc index 3b2004c786698b70b4c54b68d696a9cf5f5221fd..03f5a21890ffd515e83de7895c2be886b15b8967 100644 --- a/lite/kernels/arm/split_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/split_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/split_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(split_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "split_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("split_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/transpose_compute_test.cc b/lite/kernels/arm/transpose_compute_test.cc index aaf3f138a54db2c7ff766325cfd61bc51ec8b1d2..74fd14754637427277a6b19b820bb5d3de66c418 100644 --- a/lite/kernels/arm/transpose_compute_test.cc +++ b/lite/kernels/arm/transpose_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/transpose_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/transpose_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(transpose_arm, compute_shape_nchw) { } TEST(transpose, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -189,9 +189,7 @@ TEST(transpose2_arm, compute_shape_nchw) { } TEST(transpose2, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/arm/while_compute.h b/lite/kernels/arm/while_compute.h deleted file mode 100644 index f735d96f9190755daacdf846a2d99901c1a14493..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/while_compute.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/operators/while_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class StepExecutor { - typedef std::shared_ptr OpPtr; - - public: - StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - // VLOG(4) << "while: creating Op [" << op_type << "]"; - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { - for (auto &op_handler : ops_of_block_) { - // VLOG(4) << op_handler->op_info()->Repr(); - op_handler->InferShape(); - // VLOG(4) << "while: infered shape"; - op_handler->Run(); - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class WhileCompute : public KernelLite { - public: - using param_t = operators::WhileParam; - - void Run() override; - void PrepareForRun() override; - - virtual ~WhileCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/bm/bridges/batch_norm_op.cc b/lite/kernels/bm/bridges/batch_norm_op.cc index fbf70178fdd971edce34b3253b02febfa3e3b85c..f5ecc0825a17f26b1cf65605ea2e8c0c93338f39 100644 --- a/lite/kernels/bm/bridges/batch_norm_op.cc +++ b/lite/kernels/bm/bridges/batch_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -64,10 +65,16 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto* bias_data = bias->mutable_data(); auto* mean_data = mean->mutable_data(); auto* variance_data = variance->mutable_data(); + + float* new_bias = static_cast(malloc(bias->memory_size())); + float* new_scale = static_cast(malloc(scale->memory_size())); + CHECK(new_bias != nullptr); + CHECK(new_scale != nullptr); + for (int c = 0; c < channel_size; c++) { float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon)); - bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - scale_data[c] = inv_scale * scale_data[c]; + new_bias[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale[c] = inv_scale * scale_data[c]; } const int input_num = 1; @@ -86,11 +93,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_dims.size(), static_cast(output_var_name.c_str()), static_cast(unique_op_name.c_str()), - static_cast(scale->mutable_data()), - static_cast(bias->mutable_data()), + static_cast(new_scale), + static_cast(new_bias), 1, 1, 1); + free(new_scale); + free(new_bias); delete[] shape; delete[] name; delete[] dim; diff --git a/lite/kernels/bm/bridges/density_prior_box_op.cc b/lite/kernels/bm/bridges/density_prior_box_op.cc index 137c5142d5ae544226dbe5d6cd7c872fc272b71a..895901d94e2b2077f530e196ef8f30d4f57df793 100644 --- a/lite/kernels/bm/bridges/density_prior_box_op.cc +++ b/lite/kernels/bm/bridges/density_prior_box_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/bm/bridges/interpolate_op.cc b/lite/kernels/bm/bridges/interpolate_op.cc index 8c2d39b16ac0206d83199fdeac6c30a0a352856e..a77ec4e8f788e581d9d226369210a449ec50840c 100644 --- a/lite/kernels/bm/bridges/interpolate_op.cc +++ b/lite/kernels/bm/bridges/interpolate_op.cc @@ -76,6 +76,8 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { static_cast(output_var_name.c_str()), 0, 0, + 0, + 0, type); } graph->AddNode(output_var_name); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index d7640e1ac7326d9764380469dc97a7806b044437..ea0dd82325976f33f123f21e0eb4aeb5dfdbfa9d 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -28,13 +28,17 @@ namespace lite { namespace kernels { namespace bm { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; subgraph::bm::Graph graph; const auto& bridges = subgraph::Registry::Instance(); graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -42,7 +46,7 @@ int SubgraphEngine::BuildDeviceProgram() { std::string op_type = op->op_info()->Type(); LOG(INFO) << op_type; if (!bridges.Exists(op_type, TARGET(kBM))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= @@ -50,12 +54,13 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if 
(subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - std::string net_name = "bmnetc_f32umodel"; + std::string net_name = "bmnet_f32bmodel"; + auto unique_net_name = lite::subgraph::bm::UniqueName(net_name); __bmcompile_opt( - graph.GetCompilerHandle(), const_cast(net_name.c_str()), 1); + graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 2); void* bmodel_data = nullptr; unsigned int data_size = 0; bm_hd_ = static_cast(ctx.GetHandle()); @@ -63,19 +68,17 @@ int SubgraphEngine::BuildDeviceProgram() { graph.UnlockCompilerMutex(); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { - return subgraph::FAILED; + return false; } bmrt_get_network_names(bmrt_hd_, &net_names_); net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); auto& stage = net_info_->stages[0]; // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); device_inputs_.resize(input_names_.size()); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(net_info_->input_names[i]); + origin_itensors_[i] = + exec_scope_->FindMutableTensor(net_info_->input_names[i]); CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); @@ -88,8 +91,6 @@ int SubgraphEngine::BuildDeviceProgram() { stage.input_shapes[i]); } // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); device_outputs_.resize(net_info_->output_num); int out_index = 0; for (int i = 0; i < output_names_.size(); i++) { @@ -97,14 +98,13 @@ int SubgraphEngine::BuildDeviceProgram() { } for (int i = 0; i < net_info_->output_num; i++) { - Tensor* t_cur = scope_->FindMutableTensor(net_info_->output_names[i]); + Tensor* t_cur = exec_scope_->FindMutableTensor(net_info_->output_names[i]); CHECK(t_cur != nullptr); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) { origin_otensors_[out_index] = t_cur; - origin_odims_[out_index] = origin_otensors_[out_index]->dims(); origin_otensors_[out_index]->mutable_data(); out_index += 1; } @@ -116,10 +116,10 @@ int SubgraphEngine::BuildDeviceProgram() { net_info_->output_dtypes[i], stage.output_shapes[i]); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_inputs_.size(); i++) { bm_memcpy_s2d(bm_hd_, device_inputs_[i].device_mem, @@ -143,24 +143,23 @@ int SubgraphEngine::LaunchDeviceProgram() { out_index++; } } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace bm diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h index 60f7661c7990d90020dbfc7ec3a6e0d178dceb70..d1dcb3a6d3ef7eb6d9091eb45d1960862cca273a 100644 --- a/lite/kernels/bm/subgraph_compute.h +++ b/lite/kernels/bm/subgraph_compute.h @@ -36,16 +36,20 @@ class 
SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; private: void *bmrt_hd_; diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 1a58a51c36a1ccbb21bb2830a197c096e7ddac51..3d396cfa12f8d89e4d868f5bce98cf143ab072ec 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -6,12 +6,16 @@ message(STATUS "compile with lite CUDA kernels") # basic kernels add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(gru_compute_cuda CUDA basic SRCS gru_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(matmul_compute_cuda CUDA basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps}) add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sigmoid_compute_cuda CUDA basic SRCS sigmoid_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_concat_compute_cuda CUDA extra SRCS sequence_pool_concat_compute.cu DEPS ${lite_kernel_deps}) @@ -34,7 +38,10 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute. 
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda}) add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_mask_compute_cuda CUDA extra SRCS sequence_mask_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_arithmetic_compute_cuda CUDA extra SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) @@ -44,6 +51,8 @@ add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps}) # unit test lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) @@ -53,6 +62,7 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda) nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda) nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) +nv_test(sigmoid_compute_cuda_test SRCS sigmoid_compute_test.cc DEPS sigmoid_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) @@ -60,7 +70,10 @@ nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda) +nv_test(gru_compute_cuda_test SRCS gru_compute_test.cc DEPS gru_compute_cuda) +nv_test(matmul_compute_cuda_test SRCS matmul_compute_test.cc DEPS matmul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) 
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) #nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) @@ -74,9 +87,14 @@ if(LITE_BUILD_EXTRA) nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) + nv_test(sequence_pad_compute_cuda_test SRCS sequence_pad_compute_test.cc DEPS sequence_pad_compute_cuda) + nv_test(sequence_unpad_compute_cuda_test SRCS sequence_unpad_compute_test.cc DEPS sequence_unpad_compute_cuda) + nv_test(sequence_mask_compute_cuda_test SRCS sequence_mask_compute_test.cc DEPS sequence_mask_compute_cuda) nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) + nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda) + nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda) endif() diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a2740101c2883b3b2f7c999bd96fd3fbd3ab3ce --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/kernels/cuda/assign_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void TensorFromVector(const std::vector& src, + lite::Tensor* dst, + cudaStream_t* stream) { + auto* src_ptr = static_cast(src.data()); + auto* dst_ptr = static_cast(dst->mutable_data(TARGET(kCUDA))); + auto size = src.size() * sizeof(T); + TargetWrapperCuda::MemcpyAsync( + dst_ptr, src_ptr, size, IoDirection::HtoD, *stream); +} + +void AssignValueCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + int dtype = param.dtype; + std::vector fp32_values = param.fp32_values; + std::vector int32_values = param.int32_values; + std::vector int64_values = param.int64_values; + std::vector bool_values = param.bool_values; + auto* out = param.Out; + + if (dtype == static_cast(lite::core::FluidType::INT32)) { + TensorFromVector(int32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + TensorFromVector(fp32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + TensorFromVector(int64_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + TensorFromVector(bool_values, out, &stream); + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + return; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(assign_value, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AssignValueCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/cuda/assign_value_compute.h b/lite/kernels/cuda/assign_value_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c334e36d8061437881a4ea67d960f87b7ffb3ceb --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueCompute : public KernelLite { + public: + using param_t = operators::AssignValueParam; + + void Run() override; + virtual ~AssignValueCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/assign_value_compute_test.cc b/lite/kernels/cuda/assign_value_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c29426b745e92f71bcfeca6a8fc2890cd1908b4 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/assign_value_compute.h" + +#include + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueTest : public ::testing::Test { + protected: + AssignValueTest() : dtype_(5), shape_({1}) { + int num = std::accumulate( + shape_.begin(), shape_.end(), 1, std::multiplies()); + fp32_values_.resize(num); + int32_values_.resize(num); + int64_values_.resize(num); + bool_values_.resize(num); + for (int i = 0; i < num; ++i) { + fp32_values_[i] = i + 5; + int32_values_[i] = i; + int64_values_[i] = i; + bool_values_[i] = i; + } + std::vector out_shape(shape_.size(), 0); + for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i]; + out_ref_.Resize(lite::DDim(out_shape)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + + RunBaseLine(&out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.shape = shape_; + param_.dtype = dtype_; + param_.fp32_values = fp32_values_; + param_.int32_values = int32_values_; + param_.int64_values = int64_values_; + param_.bool_values = bool_values_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() {} + + void InitHalfInput() {} + + void RunBaseLine(lite::Tensor* out) { + if (dtype_ == static_cast(lite::core::FluidType::INT32)) { + for (size_t i = 0; i < int32_values_.size(); ++i) { + out->mutable_data()[i] = int32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::FP32)) { + for (size_t i = 0; i < fp32_values_.size(); ++i) { + out->mutable_data()[i] = fp32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::INT64)) { + for (size_t i = 0; i < int64_values_.size(); ++i) { + out->mutable_data()[i] = int64_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::BOOL)) { + for (size_t i = 0; i < bool_values_.size(); ++i) { + out->mutable_data()[i] = bool_values_[i]; + } + } else { + LOG(FATAL) << "Unsupported dtype_ for assign_value_op:" << dtype_; + } + } + + int dtype_; + std::vector shape_; + std::vector fp32_values_; + std::vector int32_values_; + std::vector int64_values_; + std::vector bool_values_; + + lite::Tensor out_ref_; + lite::Tensor out_gpu_; + lite::Tensor out_cpu_; + + operators::AssignValueParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(AssignValueTest, fp32) { + InitFloatInput(); + AssignValueCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + 
cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/concat_compute_test.cc b/lite/kernels/cuda/concat_compute_test.cc index cc12fcd289d36c38f02663c6a7aaa0ec7c70653a..08dd4013a5ce75ea5abc0c9d678f7437276df161 100644 --- a/lite/kernels/cuda/concat_compute_test.cc +++ b/lite/kernels/cuda/concat_compute_test.cc @@ -69,7 +69,7 @@ void concat_compute_ref(const operators::ConcatParam& param) { std::vector input_cols(input.size()); for (int i = 0; i < num; ++i) { int input_i_numel = input[i]->dims().size() == 0 ? 0 : 1; - for (int didx = 0; didx < input[i]->dims().size(); ++didx) { + for (size_t didx = 0; didx < input[i]->dims().size(); ++didx) { input_i_numel *= input[i]->dims()[didx]; } int t_cols = input_i_numel / rows; diff --git a/lite/kernels/cuda/dropout_compute.cc b/lite/kernels/cuda/dropout_compute.cc index 7e3a3a62432f3bc5f2e62112b2b220abc17ee2bd..f9303a39cebda322526e6cc25401db35e1f4309b 100644 --- a/lite/kernels/cuda/dropout_compute.cc +++ b/lite/kernels/cuda/dropout_compute.cc @@ -23,6 +23,9 @@ namespace cuda { void DropoutCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); float* out_data = param.output->mutable_data(TARGET(kCUDA)); int num = param.x->dims().production(); @@ -31,7 +34,7 @@ void DropoutCompute::Run() { if (param.dropout_implementation == "downgrade_in_infer") { scale = 1.0f - prob_data; } - lite::cuda::math::scale(num, x_data, out_data, scale, 0); + lite::cuda::math::scale(num, x_data, out_data, scale, 0.f, stream); } } // namespace cuda diff --git a/lite/kernels/cuda/fc_compute.cu b/lite/kernels/cuda/fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ad376577b133540b782e2726564302a95ddf216 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.cu @@ -0,0 +1,353 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
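The dropout change above threads the CUDA exec stream into the scale kernel; at inference time, the "downgrade_in_infer" implementation multiplies activations by (1 - dropout_prob), while the other implementation leaves them unscaled. A scalar sketch of that behavior (illustrative, not the CUDA kernel):

#include <string>

// Sketch of inference-time dropout: no masking, only optional rescaling.
void dropout_infer_ref(const float* x, float* out, int num,
                       float dropout_prob, const std::string& impl) {
  const float scale =
      impl == "downgrade_in_infer" ? 1.0f - dropout_prob : 1.0f;
  for (int i = 0; i < num; ++i) {
    out[i] = x[i] * scale;
  }
}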
+#include "lite/kernels/cuda/fc_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct FcTypeTraits; + +template <> +struct FcTypeTraits { + typedef float4 Type; +}; + +template +__global__ void AddBiasV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hadd2(in_ptr, bias_ptr); +#else + half2 packed_val; + packed_val.x = __hadd(in_ptr.x, bias_ptr.x); + packed_val.y = __hadd(in_ptr.y, bias_ptr.y); + data[index] = packed_val; +#endif + } +} + +template +__global__ void AddBiasReluV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasReluV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hmul2(__hgt2(in_ptr + bias_ptr, __float2half2_rn(0.f)), + in_ptr + bias_ptr); +#else + const float2 bias = __half22float2(bias_ptr); + const float2 in = __half22float2(in_ptr); + data[index] = __floats2half2_rn( + bias.x + in.x > 0.0f ? static_cast(bias.x + in.x) : 0.0f, + bias.y + in.y > 0.0f ? 
static_cast(bias.y + in.y) : 0.0f); +#endif + } +} + +template +__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + packed_val.z = in_ptr.z + bias_ptr.z; + packed_val.w = in_ptr.w + bias_ptr.w; + data[index] = packed_val; + } +} + +template +__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + packed_val.z = fmaxf(0.f, in_ptr.z + bias_ptr.z); + packed_val.w = fmaxf(0.f, in_ptr.w + bias_ptr.w); + data[index] = packed_val; + } +} + +template +__global__ void AddBias(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = temp; + } +} + +template <> +__global__ void AddBias(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + data[offset + i] = temp; + } +} + +template +__global__ void AddBiasRelu(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = static_cast(temp > 0) * temp; + } +} + +template <> +__global__ void AddBiasRelu(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + +#if __CUDA_ARCH__ >= 530 + data[offset + i] = + __hgt(temp, __float2half(0.0f)) ? temp : __float2half(0.0f); +#else + data[offset + i] = + __float2half(__half2float(temp) > 0.f ? __half2float(temp) : 0.f); +#endif + } +} + +template +void FcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 4 == 0) { + const int threads = 256; + const int num = M * N / 4; + const int blocks = (num + threads - 1) / threads; + typedef typename FcTypeTraits::Type trans_type; + const auto* bias_ptr_v4 = reinterpret_cast(b_data); + auto* data_ptr_v4 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else if (activation_type == "") { + AddBiasV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +template <> +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 2 == 0) { + const int threads = 256; + const int num = M * N / 2; + const int blocks = (num + threads - 1) / threads; + const auto* bias_ptr_v2 = reinterpret_cast(b_data); + auto* data_ptr_v2 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else if (activation_type == "") { + AddBiasV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using FcFp32 = paddle::lite::kernels::cuda::FcCompute; + +using FcFp16 = paddle::lite::kernels::cuda::FcCompute; + +REGISTER_LITE_KERNEL(fc, kCUDA, kFloat, kNCHW, FcFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kCUDA, kFP16, kNCHW, FcFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/fc_compute.h b/lite/kernels/cuda/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..700194c115824762411e952c77d06cb01a754bc0 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
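The fc kernel above runs the cuBLAS-backed Gemm for Out = X * W and then adds the bias (optionally fused with ReLU) in a second elementwise pass, switching to float4-packed kernels when N is a multiple of 4 and to half2-packed kernels on the fp16 path when N is even. The self-contained sketch below illustrates that fp32 packed epilogue; the kernel name add_bias_relu_v4, the toy sizes, and the managed-memory setup are illustrative assumptions, not part of this patch.

// Minimal sketch of the vectorized bias+ReLU epilogue used after the GEMM.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void add_bias_relu_v4(int num, const float4* bias, float4* data, int k4) {
  // Grid-stride loop over M * N / 4 packed elements; k4 == N / 4, so
  // i % k4 selects the bias group for this output column block.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    float4 b = bias[i % k4];
    float4 v = data[i];
    v.x = fmaxf(0.f, v.x + b.x);
    v.y = fmaxf(0.f, v.y + b.y);
    v.z = fmaxf(0.f, v.z + b.z);
    v.w = fmaxf(0.f, v.w + b.w);
    data[i] = v;
  }
}

int main() {
  const int M = 8, N = 64;  // N % 4 == 0, so the packed path applies
  float *out, *bias;
  cudaMallocManaged(&out, M * N * sizeof(float));
  cudaMallocManaged(&bias, N * sizeof(float));
  for (int i = 0; i < M * N; ++i) out[i] = -1.f + i % 3;  // stand-in GEMM result
  for (int j = 0; j < N; ++j) bias[j] = 0.5f;
  int num = M * N / 4;
  int threads = 256, blocks = (num + threads - 1) / threads;
  add_bias_relu_v4<<<blocks, threads>>>(num,
                                        reinterpret_cast<float4*>(bias),
                                        reinterpret_cast<float4*>(out),
                                        N / 4);
  cudaDeviceSynchronize();
  printf("out[0]=%f out[1]=%f\n", out[0], out[1]);
  cudaFree(out);
  cudaFree(bias);
  return 0;
}

Packing four outputs per thread keeps this memory-bound pass coalesced and computes the bias index once per vector, which is the same rationale behind the N % 4 (fp32) and N % 2 (fp16) dispatch in the kernel above.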
+ +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~FcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute_test.cc b/lite/kernels/cuda/fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa0dada729ca01cb1a4176ca585ce8f921f3aa42 --- /dev/null +++ b/lite/kernels/cuda/fc_compute_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/fc_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class FcTest : public ::testing::Test { + protected: + FcTest() + : m_(8), + k_(16), + n_(64), + in_num_col_dims_(1), + act_type_("relu"), + x_shape_({m_, k_}), + w_shape_({k_, n_}), + b_shape_({n_}), + out_shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + b_ref_.Resize(lite::DDim(b_shape_)); + b_gpu_.Resize(lite::DDim(b_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + auto b_ref_data = b_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < b_ref_.numel(); i++) { + b_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.w = &w_gpu_; + param_.bias = &b_gpu_; + param_.in_num_col_dims = in_num_col_dims_; + param_.activation_type = act_type_; + param_.output = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + b_gpu_.Assign(b_ref_.data(), + b_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = 
half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + b_half_.Resize(b_ref_.dims()); + auto b_half_data = b_half_.mutable_data(); + for (int64_t i = 0; i < b_half_.numel(); i++) { + b_half_data[i] = half(lite::float16(b_ref_.data()[i])); + } + b_gpu_.Assign(b_half_data, b_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* w, + const lite::Tensor* b, + lite::Tensor* out) { + const float* data_in = x->data(); + const float* bias = b->data(); + const float* weights = w->data(); + float* data_out = out->mutable_data(); + int out_rows = x->dims()[0]; + int in_cols = x->numel() / out_rows; + int out_cols = w->numel() / in_cols; + int index_out; + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[k * out_cols + j]; + } + if (act_type_ == "relu") { + data_out[index_out] *= static_cast(data_out[index_out] > 0); + } + } + } + } + + int m_, k_, n_, in_num_col_dims_; + std::string act_type_; + std::vector x_shape_, w_shape_, b_shape_, out_shape_; + lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_, b_gpu_; + lite::Tensor x_half_, w_half_, b_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::FcParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(FcTest, TestFP32) { + InitFloatInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(FcTest, TestFP16) { + InitHalfInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } 
+} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/gru_compute.cu b/lite/kernels/cuda/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..ddca95048b303cce55cc3435b15f945a84fc8c0c --- /dev/null +++ b/lite/kernels/cuda/gru_compute.cu @@ -0,0 +1,394 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/bias.h" +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct GRUMetaValue { + T* gate_weight; + T* state_weight; + T* gate_value; + T* reset_output_value; + T* output_value; + T* prev_out_value; +}; + +template +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + if (lite::TargetWrapperCuda::GetComputeCapability() >= 70) { + if (frame_size < 16) { + constexpr int tiled_size = 8; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } else { + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } + return; + } else { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + T><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + T><<exec_stream()>>>(value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template struct GRUUnitFunctor; + +template <> +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + half><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + half><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +void GRUCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void GRUCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + auto* input = param.input; + lite::Tensor* h0{nullptr}; + if (param.h0) { + h0 = const_cast(param.h0); + } + lite::Tensor* bias{nullptr}; + if (param.bias) { + bias = const_cast(param.bias); + } + const lite::Tensor* 
weight = param.weight; + T* weight_data = const_cast(weight->template data()); + lite::Tensor* batch_gate = param.batch_gate; + lite::Tensor* batch_reset_hidden_prev = param.batch_reset_hidden_prev; + lite::Tensor* batch_hidden = param.batch_hidden; + lite::Tensor* hidden = param.hidden; + T* batch_reset_hidden_prev_data = + batch_reset_hidden_prev->template mutable_data(TARGET(kCUDA)); + hidden->template mutable_data(TARGET(kCUDA)); + T* batch_gate_data = batch_gate->template mutable_data(TARGET(kCUDA)); + T* batch_hidden_data = batch_hidden->template mutable_data(TARGET(kCUDA)); + bool is_reverse = param.is_reverse; + auto active_node = lite::cuda::math::GetActiveType(param.activation); + auto active_gate = lite::cuda::math::GetActiveType(param.gate_activation); + bool origin_mode = param.origin_mode; + + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + lite::cuda::math::LoDTensor2BatchFunctor batch_func; + batch_func(*input, batch_gate, is_reverse, stream); + + if (bias) { + lite::cuda::math::RowwiseAdd add_bias; + add_bias(batch_gate_data, + bias->template data(), + batch_gate_data, + frame_size, + batch_gate->numel(), + stream); + } + GRUMetaValue gru_value; + gru_value.gate_weight = weight_data; + gru_value.state_weight = weight_data + 2 * frame_size * frame_size; + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ordered_h0_.Resize(h0->dims()); + lite::cuda::math::CopyMatrixRowsFunctor row_shuffle; + row_shuffle(*h0, &ordered_h0_, batch_gate->lod()[2], true, stream); + gru_value.prev_out_value = ordered_h0_.mutable_data(TARGET(kCUDA)); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (size_t n = 0; n < num_batch; ++n) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + gru_value.output_value = batch_hidden_data + bstart * frame_size; + gru_value.gate_value = batch_gate_data + bstart * frame_size * 3; + gru_value.reset_output_value = + batch_reset_hidden_prev_data + bstart * frame_size; + + GRUUnitFunctor::compute(gru_value, + frame_size, + cur_batch_size, + active_node, + active_gate, + origin_mode, + gemm_impl_.get(), + &context); + gru_value.prev_out_value = gru_value.output_value; + } + + lite::cuda::math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(*batch_hidden, hidden, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using GRUFp32 = + paddle::lite::kernels::cuda::GRUCompute; + +using GRUFp16 = paddle::lite::kernels::cuda::GRUCompute; + +REGISTER_LITE_KERNEL(gru, kCUDA, kFloat, kNCHW, GRUFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gru, kCUDA, kFP16, kNCHW, GRUFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), 
PRECISION(kFP16))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Weight", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchGate", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchResetHiddenPrev", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchHidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Hidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/gru_compute.h b/lite/kernels/cuda/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..070deca2c54b919d1afeb856633d94fe5919eabd --- /dev/null +++ b/lite/kernels/cuda/gru_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class GRUCompute : public KernelLite { + public: + using param_t = operators::GRUParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GRUCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; + lite::Tensor ordered_h0_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/gru_compute_test.cc b/lite/kernels/cuda/gru_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..adff5b6b28d6a2b4b9513148fa1219f78534dfca --- /dev/null +++ b/lite/kernels/cuda/gru_compute_test.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
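The GRU path above stages one time step as two GEMMs (prev_out times gate_weight accumulated into the update/reset slots of gate_value, then the reset-scaled hidden state times state_weight accumulated into the candidate slot) plus the GruForwardResetOutput and GruForwardFinalOutput kernels. A CPU sketch of a single step with that packed [update | reset | candidate] gate layout follows; it is illustrative only, and the final blend h = u * prev + (1 - u) * c is just one of the two conventions selected by origin_mode.

// CPU reference for one GRU step, batch size 1 (illustrative sketch).
#include <cmath>
#include <cstdio>
#include <vector>

// x_gates: precomputed x_t projection plus bias, laid out [update | reset | candidate],
// each of width `frame`. gate_w is [frame, 2*frame] (update/reset recurrences),
// state_w is [frame, frame] (candidate recurrence), mirroring
//   gate_weight = weight, state_weight = weight + 2 * frame * frame.
void gru_step(const std::vector<float>& x_gates,
              const std::vector<float>& prev,
              const std::vector<float>& gate_w,
              const std::vector<float>& state_w,
              std::vector<float>* out,
              int frame) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  std::vector<float> u(frame), r(frame), c(frame);
  // u_t, r_t = sigmoid(x_gates[:2*frame] + prev * gate_w)
  for (int j = 0; j < 2 * frame; ++j) {
    float acc = x_gates[j];
    for (int i = 0; i < frame; ++i) acc += prev[i] * gate_w[i * 2 * frame + j];
    (j < frame ? u[j] : r[j - frame]) = sigmoid(acc);
  }
  // c_t = tanh(x_gates[2*frame:] + (r_t * prev) * state_w)
  for (int j = 0; j < frame; ++j) {
    float acc = x_gates[2 * frame + j];
    for (int i = 0; i < frame; ++i) acc += r[i] * prev[i] * state_w[i * frame + j];
    c[j] = std::tanh(acc);
  }
  // h_t = u_t * prev + (1 - u_t) * c_t   (one of the two origin_mode conventions)
  for (int j = 0; j < frame; ++j) (*out)[j] = u[j] * prev[j] + (1.f - u[j]) * c[j];
}

int main() {
  const int frame = 4;
  std::vector<float> x_gates(3 * frame, 0.1f), prev(frame, 0.5f),
      gate_w(frame * 2 * frame, 0.01f), state_w(frame * frame, 0.01f),
      out(frame, 0.f);
  gru_step(x_gates, prev, gate_w, state_w, &out, frame);
  for (float v : out) printf("%f ", v);
  printf("\n");
  return 0;
}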
+ +#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class GRUTest : public ::testing::Test { + protected: + GRUTest() + : batch_(12), + frame_size_(128), + activation_("tanh"), + gate_activation_("sigmoid"), + is_reverse_(false), + origin_mode_(false), + x_shape_({batch_, frame_size_ * 3}), + w_shape_({frame_size_, frame_size_ * 3}), + out_shape_({batch_, frame_size_}), + lod_({{0, 4, 9, 12}}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(lod_); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + batch_gate_gpu_.Resize(lite::DDim(x_shape_)); + batch_hidden_gpu_.Resize(lite::DDim(out_shape_)); + batch_reset_hidden_gpu_.Resize(lite::DDim(out_shape_)); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.weight = &w_gpu_; + param_.gate_activation = gate_activation_; + param_.activation = activation_; + param_.is_reverse = is_reverse_; + param_.origin_mode = origin_mode_; + param_.hidden = &out_gpu_; + param_.batch_gate = &batch_gate_gpu_; + param_.batch_reset_hidden_prev = &batch_reset_hidden_gpu_; + param_.batch_hidden = &batch_hidden_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + } + + void RunBaseLine() {} + + int batch_, frame_size_; + std::string activation_, gate_activation_; + bool is_reverse_, origin_mode_; + std::vector x_shape_, w_shape_, out_shape_; + LoD lod_; + lite::Tensor x_ref_, w_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_; + lite::Tensor x_half_, w_half_; + lite::Tensor batch_gate_gpu_; + lite::Tensor batch_hidden_gpu_; + lite::Tensor batch_reset_hidden_gpu_; + lite::Tensor out_cpu_, out_gpu_; + + operators::GRUParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(GRUTest, TestFP32) { + InitFloatInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration 
= (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +TEST_F(GRUTest, TestFP16) { + InitHalfInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/lookup_table_compute_test.cc b/lite/kernels/cuda/lookup_table_compute_test.cc index 9323de14eb168fb55a68640350b87bf7040f5729..89050ea97f160b2fddb479966f59c05aafd8c268 100644 --- a/lite/kernels/cuda/lookup_table_compute_test.cc +++ b/lite/kernels/cuda/lookup_table_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/cuda/lookup_table_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/cuda/lookup_table_compute.h" namespace paddle { namespace lite { @@ -56,9 +58,7 @@ void LookupTableComputeRef(const operators::LookupTableParam& param) { } TEST(lookup_table_cuda, retrieve_op) { - auto lookup_table = - KernelRegistry::Global().Create( - "lookup_table"); + auto lookup_table = KernelRegistry::Global().Create("lookup_table"); ASSERT_FALSE(lookup_table.empty()); ASSERT_TRUE(lookup_table.front()); } diff --git a/lite/kernels/cuda/matmul_compute.cc b/lite/kernels/cuda/matmul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b80b673dfabdccc7c728fa3081a81a870531acf --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
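The GRU test above drives the kernel with LoD {{0, 4, 9, 12}}: level-0 offsets that delimit three sequences of lengths 4, 5 and 3 packed along the batch dimension, which is exactly what LoDTensor2BatchFunctor reorders before the per-step GEMMs. A tiny helper (not part of the patch, names are illustrative) makes that encoding explicit:

#include <cstdio>
#include <vector>

// Level-0 LoD offsets -> per-sequence lengths.
std::vector<int> lod_to_lengths(const std::vector<size_t>& offsets) {
  std::vector<int> lengths;
  for (size_t i = 0; i + 1 < offsets.size(); ++i)
    lengths.push_back(static_cast<int>(offsets[i + 1] - offsets[i]));
  return lengths;
}

int main() {
  std::vector<size_t> level0 = {0, 4, 9, 12};
  for (int len : lod_to_lengths(level0)) printf("%d ", len);  // prints: 4 5 3
  printf("\n");
  return 0;
}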
+ +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/matmul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void MatMulCompute::Run() { + auto& context = this->ctx_->template As(); + auto& param = this->template Param(); + + const auto* x_data = param.X->template data(); + const auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(TARGET(kCUDA)); + bool transpose_x = param.transpose_X; + bool transpose_y = param.transpose_Y; + float alpha = param.alpha; + + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + + int m = 0; + int k = 0; + int n = 0; + int batch = 0; + int64_t stride_x = 0; + int64_t stride_y = 0; + + if (x_dims.size() >= 2 && y_dims.size() >= 2 && + (x_dims.size() != 2 || y_dims.size() != 2)) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // or + // x: [M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [M, K], y: [B, K, N], out: [B, M, N] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = transpose_y ? y_dims[y_dims.size() - 2] : y_dims[y_dims.size() - 1]; + int batch_x = x_dims.size() == 2 ? 0 : x_dims.count(0, x_dims.size() - 2); + int batch_y = y_dims.size() == 2 ? 0 : y_dims.count(0, y_dims.size() - 2); + CHECK(batch_x == batch_y || batch_x == 0 || batch_y == 0) + << "batch_size x should be equal to batch_size y, or " + "one of batch_size x and batch_size y should be 0. " + "But got batch_size x = " + << batch_x << ", batch_size y = " << batch_y; + batch = batch_x == 0 ? batch_y : batch_x; + stride_x = x_dims.size() == 2 ? 0 : m * k; + stride_y = y_dims.size() == 2 ? 0 : k * n; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + m = transpose_x ? x_dims[1] : x_dims[0]; + k = transpose_x ? x_dims[0] : x_dims[1]; + n = transpose_y ? y_dims[0] : y_dims[1]; + gemm_impl_->init(transpose_x, transpose_y, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (x_dims.size() > 2 && y_dims.size() == 1) { + // x: [B, M, K], y: [K], out: [B, M] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? 
x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = 1; + batch = x_dims.count(0, x_dims.size() - 2); + stride_x = m * k; + stride_y = 0; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + if (!transpose_x && !transpose_y) { + // x: [K], y: [K], out: [1] + m = 1; + k = x_dims[0]; + n = 1; + CHECK_EQ(x_dims[0], y_dims[0]) + << "x_dims[0] should be equal to y_dims[0]"; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (transpose_x && transpose_y) { + // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N] + m = x_dims[0]; + k = 1; + n = y_dims[0]; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" + << y_dims << "), transpose_x(" << transpose_x + << "), transpose_y(" << transpose_y << ")"; + } + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims + << ")"; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using MatMulFp32 = + paddle::lite::kernels::cuda::MatMulCompute; + +using MatMulFp16 = + paddle::lite::kernels::cuda::MatMulCompute; + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFloat, kNCHW, MatMulFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFP16, kNCHW, MatMulFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/matmul_compute.h b/lite/kernels/cuda/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..69ad178d9184b7c3893f49a23024a14d7466115b --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
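MatMulCompute::Run above reduces every supported shape combination to either a plain GEMM or a strided batched GEMM, using a zero stride to broadcast a 2-D operand across the batch. The host-side sketch below re-derives (m, n, k, batch, stride_x, stride_y) under the same rules for the batched branch; the struct and helper names are assumptions made for illustration.

#include <cassert>
#include <cstdio>
#include <vector>

struct StridedGemmShape {
  int m, n, k, batch;
  long long stride_x, stride_y;
};

// Mirrors the "at least one side is batched" branch of the kernel above.
StridedGemmShape InferShape(const std::vector<long long>& x_dims,
                            const std::vector<long long>& y_dims,
                            bool trans_x, bool trans_y) {
  auto count = [](const std::vector<long long>& d, size_t b, size_t e) {
    long long p = 1;
    for (size_t i = b; i < e; ++i) p *= d[i];
    return p;
  };
  StridedGemmShape s{};
  size_t xr = x_dims.size(), yr = y_dims.size();
  s.m = static_cast<int>(trans_x ? x_dims[xr - 1] : x_dims[xr - 2]);
  s.k = static_cast<int>(trans_x ? x_dims[xr - 2] : x_dims[xr - 1]);
  s.n = static_cast<int>(trans_y ? y_dims[yr - 2] : y_dims[yr - 1]);
  long long batch_x = xr == 2 ? 0 : count(x_dims, 0, xr - 2);
  long long batch_y = yr == 2 ? 0 : count(y_dims, 0, yr - 2);
  assert(batch_x == batch_y || batch_x == 0 || batch_y == 0);
  s.batch = static_cast<int>(batch_x == 0 ? batch_y : batch_x);
  s.stride_x = xr == 2 ? 0 : 1LL * s.m * s.k;   // 0 stride => broadcast x
  s.stride_y = yr == 2 ? 0 : 1LL * s.k * s.n;   // 0 stride => broadcast y
  return s;
}

int main() {
  // x: [B, M, K] = [4, 8, 16], y: [K, N] = [16, 32]  ->  y is broadcast.
  auto s = InferShape({4, 8, 16}, {16, 32}, false, false);
  printf("m=%d n=%d k=%d batch=%d stride_x=%lld stride_y=%lld\n",
         s.m, s.n, s.k, s.batch, s.stride_x, s.stride_y);
  return 0;
}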
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/backends/cuda/math/strided_gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + strided_gemm_impl_.reset(new lite::cuda::math::StridedGemm); + gemm_impl_.reset(new lite::cuda::math::Gemm); + } + + void Run() override; + + virtual ~MatMulCompute() = default; + + private: + std::unique_ptr> strided_gemm_impl_{ + nullptr}; + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/matmul_compute_test.cc b/lite/kernels/cuda/matmul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..89f40af3920ba0d3e36781955ffbf5eaba093257 --- /dev/null +++ b/lite/kernels/cuda/matmul_compute_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/matmul_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatMulTest : public ::testing::Test { + protected: + MatMulTest() + : x_trans_(false), + y_trans_(true), + alpha_(1.0f), + x_shape_({4, 1, 2}), + y_shape_({4, 1, 2}), + out_shape_({4, 1, 1}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + y_ref_.Resize(lite::DDim(y_shape_)); + y_gpu_.Resize(y_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto y_ref_data = y_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(1); + } + for (int64_t i = 0; i < y_ref_.numel(); i++) { + y_ref_data[i] = static_cast(1); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &y_gpu_; + param_.transpose_X = x_trans_; + param_.transpose_Y = y_trans_; + param_.alpha = alpha_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + y_gpu_.Assign(y_ref_.data(), + y_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(x_ref_.dims()); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); ++i) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + y_half_.Resize(y_ref_.dims()); + auto y_half_data = y_half_.mutable_data(); + for 
(int64_t i = 0; i < y_half_.numel(); i++) { + y_half_data[i] = half(lite::float16(y_ref_.data()[i])); + } + y_gpu_.Assign(y_half_data, y_gpu_.dims()); + } + + void RunBaseLine() { + auto* out_data = out_ref_.mutable_data(); + for (int64_t i = 0; i < out_ref_.numel(); ++i) { + out_data[i] = 2; + } + } + + bool x_trans_, y_trans_; + float alpha_; + std::vector x_shape_, y_shape_, out_shape_; + lite::Tensor x_ref_, y_ref_, out_ref_; + lite::Tensor x_gpu_, y_gpu_; + lite::Tensor x_half_, y_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::MatMulParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(MatMulTest, TestFP32) { + InitFloatInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(MatMulTest, TestFP16) { + InitHalfInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/scale_compute.cc b/lite/kernels/cuda/scale_compute.cc index 6bf7414d8c85383a834159678cdd5a09e0b434d9..9ce5905a7de750e1eed41e56784419c737e6d2d9 100644 --- a/lite/kernels/cuda/scale_compute.cc +++ b/lite/kernels/cuda/scale_compute.cc @@ -23,8 +23,11 @@ namespace cuda { void ScaleCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); - float* output_data = param.output->mutable_data(); + float* output_data = param.output->mutable_data(TARGET(kCUDA)); DDim x_dims = param.x->dims(); bool bias_after_scale = param.bias_after_scale; float scale = param.scale; @@ -33,7 +36,7 @@ void ScaleCompute::Run() { bias *= scale; } lite::cuda::math::scale( - x_dims.production(), x_data, output_data, scale, bias); + x_dims.production(), x_data, output_data, scale, bias, stream); } } // namespace cuda diff --git 
a/lite/kernels/cuda/sequence_mask_compute.cu b/lite/kernels/cuda/sequence_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8e227a6a272127f500e10775f7ed4db53660e1f8 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.cu @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void SequenceMaskKernel(T* dst, + const int64_t* src, + int count, + int maxlen) { + CUDA_KERNEL_LOOP(index, count) { + int src_idx = index / maxlen; + int inner_idx = index % maxlen; + dst[index] = static_cast(inner_idx < src[src_idx] ? 1 : 0); + } +} + +template +void SequenceMaskCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const int64_t* x_data = x->template data(); + auto* y = param.Y; + int maxlen = param.maxlen; + + if (param.MaxLenTensor) { + auto* len_tensor_data = param.MaxLenTensor->template data(); + int32_t len_data{0}; + TargetWrapperCuda::MemcpySync( + &len_data, len_tensor_data, sizeof(int32_t), IoDirection::DtoH); + maxlen = len_data; + } + + if (maxlen < 0) { + maxlen = static_cast( + thrust::reduce(thrust::device_pointer_cast(x_data), + thrust::device_pointer_cast(x_data) + x->numel(), + static_cast(0), + thrust::maximum())); + } + + auto y_dim = x->dims().Vectorize(); + y_dim.push_back(maxlen); + y->Resize(y_dim); + const int count = y->numel(); + auto* dst_data = y->template mutable_data(TARGET(kCUDA)); + if (param.out_dtype == 5) { + SequenceMaskKernel< + T><<>>( + dst_data, x_data, count, maxlen); + } else { + LOG(FATAL) << "not supported out_dtype: " << param.out_dtype; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqMaskFp32 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +using SeqMaskFp16 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFloat, kNCHW, SeqMaskFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFP16, kNCHW, SeqMaskFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_mask_compute.h b/lite/kernels/cuda/sequence_mask_compute.h new file 
mode 100644 index 0000000000000000000000000000000000000000..3611587f0ce7daef1a88f5b6a916e2d30d33bcc1 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceMaskCompute : public KernelLite { + public: + using param_t = operators::SequenceMaskParam; + + void Run() override; + virtual ~SequenceMaskCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute_test.cc b/lite/kernels/cuda/sequence_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..efbdf2ae00b6d1d9353831e94a202e5e42228b62 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute_test.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
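The sequence_mask kernel above expands X (a vector of lengths) into Y of shape X.dims() + [maxlen] with Y[i][j] = (j < X[i]), resolving maxlen from MaxLenTensor or, when it is negative, from max(X) via thrust::reduce. A CPU reference with the same semantics, using the same lengths as the test that follows, is sketched here (illustrative only):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Y[i][j] = 1 if j < x[i] else 0; maxlen < 0 means "use max(x)".
std::vector<float> sequence_mask(const std::vector<int64_t>& x, int maxlen) {
  if (maxlen < 0)
    maxlen = static_cast<int>(*std::max_element(x.begin(), x.end()));
  std::vector<float> y(x.size() * maxlen);
  for (size_t i = 0; i < x.size(); ++i)
    for (int j = 0; j < maxlen; ++j)
      y[i * maxlen + j] = j < x[i] ? 1.f : 0.f;
  return y;
}

int main() {
  auto y = sequence_mask({3, 2, 1, 0}, 4);  // lengths used by the test below
  for (size_t i = 0; i < y.size(); ++i)
    printf("%.0f%s", y[i], (i + 1) % 4 == 0 ? "\n" : " ");
  return 0;
}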
+ +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceMaskTest : public ::testing::Test { + protected: + SequenceMaskTest() + : maxlen_(4), + out_dtype_(5), + x_data_({3, 2, 1, 0}), + out_shape_({static_cast(x_data_.size()), maxlen_}) { + x_ref_.Resize(lite::DDim({static_cast(x_data_.size())})); + x_gpu_.Resize(x_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + + // prepare input + for (size_t i = 0; i < x_data_.size(); i++) { + x_ref_data[i] = x_data_[i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &out_gpu_; + param_.maxlen = maxlen_; + param_.out_dtype = out_dtype_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + auto* out_data = out->mutable_data(); + + for (size_t i = 0; i < x_data_.size(); ++i) { + for (int j = 0; j < maxlen_; ++j) { + out_data[i * maxlen_ + j] = j < x_data_[i] ? 1 : 0; + } + } + } + + int maxlen_, out_dtype_; + std::vector x_data_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor out_cpu_; + + operators::SequenceMaskParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceMaskTest, fp32) { + InitFloatInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceMaskTest, TestFP16) { + InitHalfInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = 
static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute.cu b/lite/kernels/cuda/sequence_pad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8368eb3007e3f1d036420a5dc1c86204365e179c --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.cu @@ -0,0 +1,116 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequencePadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const auto* pad_value = param.PadValue; + auto* out = param.Out; + auto* len_t = param.Length; + int seq_num = x->lod()[0].size() - 1; + int padded_length; + if (param.padded_length == -1) { + int max_seq_len = 0; + for (int i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + } + padded_length = max_seq_len; + } else { + padded_length = param.padded_length; + } + + int max_seq_len = 0; + int step_width = x->numel() / x->dims()[0]; + + // calc for param.Lenght + seq_len_.resize(seq_num); + seq_offsets_vec_.resize(x->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + seq_len_[i] = x->lod()[0][i + 1] - x->lod()[0][i]; + seq_offsets_vec_[i] = x->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = x->lod()[0][seq_num]; + TargetWrapperCuda::MemcpyAsync( + len_t->template mutable_data(TARGET(kCUDA)), + seq_len_.data(), + sizeof(int64_t) * seq_len_.size(), + IoDirection::HtoD, + stream); + seq_offsets_.Resize({static_cast(x->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* seq_data = x->template data(); + T* pad_data = out->template mutable_data(TARGET(kCUDA)); + const T* pad_value_data = pad_value->template data(); + + lite::cuda::math::SequencePadding(pad_data, + seq_data, + pad_value_data, + pad_value->numel() == 1, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqPadFp32 = + paddle::lite::kernels::cuda::SequencePadCompute; + +using SeqPadFp16 = + paddle::lite::kernels::cuda::SequencePadCompute; + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFloat, kNCHW, 
SeqPadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("PadValue", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFP16, kNCHW, SeqPadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("PadValue", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pad_compute.h b/lite/kernels/cuda/sequence_pad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c494fe127d4eb5a7c0ba77a5c76ab1d1d0c1f2f2 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequencePadCompute : public KernelLite { + public: + using param_t = operators::SequencePadParam; + + void Run() override; + virtual ~SequencePadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_len_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute_test.cc b/lite/kernels/cuda/sequence_pad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..91141984c98d5d105f51d0acc247aa878ff219a7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute_test.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
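The sequence_pad kernel registered above takes a LoD input X and a PadValue tensor (a scalar or one value per feature) and writes a dense Out of shape [seq_num, padded_length, step_width] together with a Length tensor holding each original sequence length. As a mental model for what the CUDA path computes, here is a minimal host-side sketch; the LoD offsets, sizes, and the scalar pad value in it are illustrative assumptions, not values taken from the patch. The test below checks the same layout, but with a per-feature pad value instead of a scalar.

#include <cstdio>
#include <vector>

// Host reference for sequence_pad: every sequence in `x` (flattened as
// [total_steps, step_width]) is copied into a [seq_num, padded_length,
// step_width] buffer; trailing steps are filled with `pad_value`.
void SequencePadRef(const std::vector<float>& x,
                    const std::vector<size_t>& lod,   // e.g. {0, 2, 5}
                    int step_width,
                    int padded_length,
                    float pad_value,
                    std::vector<float>* out,
                    std::vector<int64_t>* length) {
  int seq_num = static_cast<int>(lod.size()) - 1;
  out->assign(seq_num * padded_length * step_width, pad_value);
  length->resize(seq_num);
  for (int i = 0; i < seq_num; ++i) {
    int seq_len = static_cast<int>(lod[i + 1] - lod[i]);
    (*length)[i] = seq_len;
    for (int s = 0; s < seq_len; ++s) {
      for (int w = 0; w < step_width; ++w) {
        (*out)[(i * padded_length + s) * step_width + w] =
            x[(lod[i] + s) * step_width + w];
      }
    }
  }
}

int main() {
  std::vector<float> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};  // 5 steps x 2 features
  std::vector<float> out;
  std::vector<int64_t> length;
  SequencePadRef(x, {0, 2, 5}, /*step_width=*/2, /*padded_length=*/3,
                 /*pad_value=*/0.f, &out, &length);
  std::printf("lengths: %lld %lld\n",
              static_cast<long long>(length[0]),
              static_cast<long long>(length[1]));
  return 0;
}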
+ +#include "lite/kernels/cuda/sequence_pad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePadTest : public ::testing::Test { + protected: + SequencePadTest() + : batch_(5), + features_(2), + padded_length_(3), + x_lod_({{0, 2, 5}}), + x_shape_({batch_, features_}), + pad_value_shape_({features_}), + out_shape_({static_cast(x_lod_[0].size() - 1), + padded_length_, + features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(x_lod_); + x_gpu_.Resize(x_ref_.dims()); + + pad_value_ref_.Resize(lite::DDim(pad_value_shape_)); + pad_value_gpu_.Resize(pad_value_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(x_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + length_cpu_.Resize(length_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto pad_value_ref_data = pad_value_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (int64_t i = 0; i < pad_value_ref_.numel(); i++) { + pad_value_ref_data[i] = static_cast(i); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.PadValue = &pad_value_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + param_.padded_length = padded_length_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_gpu_.Assign( + pad_value_ref_.data(), pad_value_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_half_.Resize(pad_value_ref_.dims()); + auto pad_value_half_data = pad_value_half_.mutable_data(); + for (int64_t i = 0; i < pad_value_half_.numel(); i++) { + pad_value_half_data[i] = + half(lite::float16(pad_value_ref_.data()[i])); + } + pad_value_gpu_.Assign( + pad_value_half_data, pad_value_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* pad_value, + lite::Tensor* out, + lite::Tensor* length) { + auto* length_data = length->mutable_data(); + auto* out_data = out->mutable_data(); + length_data[0] = 2; + length_data[1] = 3; + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + out_data[4] = 0; + out_data[5] = 1; + for (size_t i = 4; i < 10; ++i) { + out_data[2 + i] = i; + } + } + + int batch_, features_, padded_length_; + LoD x_lod_; + std::vector x_shape_, pad_value_shape_, out_shape_; + + lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_, pad_value_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequencePadTest, fp32) { + InitFloatInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); 
+ kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_.mutable_data(), + length_gpu_.data(), + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequencePadTest, TestFP16) { + InitHalfInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + const int64_t* length_gpu_data = length_gpu_.data(); + int64_t* length_cpu_data = length_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_data, + length_gpu_data, + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute.cu b/lite/kernels/cuda/sequence_unpad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4274e19a86d55a4e5e5099e984c537c2929bce7 --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.cu @@ -0,0 +1,124 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
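The FP16 variants of these tests all follow the same pattern: the FP32 reference input is converted element-wise to half on the host, the kFP16 kernel runs on the GPU, and the result is widened back to float and compared against the FP32 baseline with a relative tolerance (1e-2 or 2e-2) instead of the absolute 1e-5 used for FP32. A small stand-alone sketch of that comparison, with illustrative helper names not present in the patch:

#include <cmath>
#include <cstddef>

// Relative-error check in the spirit of the FP16 tests: the half result is
// widened back to float and compared to the fp32 reference. The +eps in the
// denominator keeps zero references (e.g. padded positions) from dividing
// by zero.
inline bool AlmostEqualRel(float res, float ref,
                           float rel_tol = 1e-2f, float eps = 1e-5f) {
  return std::fabs(res - ref) / (std::fabs(ref) + eps) <= rel_tol;
}

inline int CountMismatches(const float* res, const float* ref, size_t n,
                           float rel_tol = 1e-2f) {
  int bad = 0;
  for (size_t i = 0; i < n; ++i) {
    if (!AlmostEqualRel(res[i], ref[i], rel_tol)) ++bad;
  }
  return bad;
}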
+ +#include + +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequenceUnpadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_dims = param.X->dims(); + auto len_dims = param.Length->dims(); + + auto* seq_len_ptr = param.Length->template data(); + seq_len_cpu_.Resize(param.Length->dims()); + TargetWrapperCuda::MemcpyAsync(seq_len_cpu_.mutable_data(), + seq_len_ptr, + sizeof(int64_t) * param.Length->numel(), + IoDirection::DtoH, + stream); + TargetWrapperCuda::StreamSync(stream); + + int64_t batch_size = len_dims[0]; + std::vector out_lod0(batch_size + 1, 0); + for (int64_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_cpu_.data()[i]; + } + paddle::lite::LoD out_lod; + out_lod.push_back(out_lod0); + + int64_t out_dim0 = out_lod0.back(); + std::vector out_dims{out_dim0}; + if (x_dims.size() == 2) { + out_dims.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims.push_back(x_dims[i]); + } + } + param.Out->Resize(out_dims); + param.Out->set_lod(out_lod); + + const auto* pad_tensor = param.X; + auto* seq_tensor = param.Out; + + int padded_length = pad_tensor->dims()[1]; + int seq_num = seq_tensor->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = seq_tensor->numel() / seq_tensor->dims()[0]; + + seq_offsets_vec_.resize(seq_tensor->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, + static_cast(seq_tensor->lod()[0][i + 1] - + seq_tensor->lod()[0][i])); + seq_offsets_vec_[i] = seq_tensor->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = seq_tensor->lod()[0][seq_num]; + seq_offsets_.Resize({static_cast(seq_tensor->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* pad_data = pad_tensor->template data(); + T* seq_data = seq_tensor->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::SequenceUnpadding(seq_data, + pad_data, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqUnadFp32 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +using SeqUnadFp16 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFloat, kNCHW, SeqUnadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFP16, kNCHW, SeqUnadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_unpad_compute.h b/lite/kernels/cuda/sequence_unpad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6b077a4dcbd91eb8f9a9e2cb1340088434f117aa --- /dev/null +++ 
b/lite/kernels/cuda/sequence_unpad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <vector> +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template <typename T, PrecisionType Ptype> +class SequenceUnpadCompute : public KernelLite<TARGET(kCUDA), Ptype> { + public: + using param_t = operators::SequenceUnpadParam; + + void Run() override; + virtual ~SequenceUnpadCompute() = default; + + private: + lite::Tensor seq_offsets_; + lite::Tensor seq_len_cpu_; + std::vector<size_t> seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute_test.cc b/lite/kernels/cuda/sequence_unpad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..417115a50b6d086bd628a0b93a7d45c688ea18af --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
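The sequence_unpad kernel above is the inverse operation: given the dense [seq_num, padded_length, step_width] tensor and the per-sequence Length tensor, it copies back only the valid steps and rebuilds the output LoD from the lengths on the host before launching the device copy. A host-side reference sketch under the same assumptions, with plain vectors in place of lite::Tensor:

#include <vector>

// Host reference for sequence_unpad: copies the first `length[i]` steps of
// each padded row back into a flat [sum(length), step_width] buffer and
// rebuilds the LoD offsets from the lengths.
void SequenceUnpadRef(const std::vector<float>& pad,  // [seq_num, padded_len, w]
                      const std::vector<int64_t>& length,
                      int padded_length,
                      int step_width,
                      std::vector<float>* seq,
                      std::vector<size_t>* lod) {
  int seq_num = static_cast<int>(length.size());
  lod->assign(1, 0);  // offsets start at 0
  for (int i = 0; i < seq_num; ++i) lod->push_back(lod->back() + length[i]);
  seq->resize(lod->back() * step_width);
  for (int i = 0; i < seq_num; ++i) {
    for (int64_t s = 0; s < length[i]; ++s) {
      for (int w = 0; w < step_width; ++w) {
        (*seq)[((*lod)[i] + s) * step_width + w] =
            pad[(i * padded_length + s) * step_width + w];
      }
    }
  }
}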
+ +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceUnpadTest : public ::testing::Test { + protected: + SequenceUnpadTest() + : batch_(5), + features_(2), + padded_length_(3), + out_lod_({{0, 2, 5}}), + x_shape_({static_cast(out_lod_[0].size() - 1), + padded_length_, + features_}), + out_shape_({batch_, features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(out_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + auto* length_ref_data = length_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) { + length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_ref_.set_lod(out_lod_); + out_gpu_.Resize(out_ref_.dims()); + out_gpu_.set_lod(out_ref_.lod()); + out_cpu_.Resize(out_ref_.dims()); + out_cpu_.set_lod(out_ref_.lod()); + + RunBaseLine(&x_ref_, &length_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* X, + const lite::Tensor* Length, + lite::Tensor* Out) { + auto* out_data = Out->mutable_data(); + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + for (size_t i = 6; i < 12; ++i) { + out_data[i - 2] = i; + } + } + + int batch_, features_, padded_length_; + LoD out_lod_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceUnpadTest, fp32) { + InitFloatInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], 
out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceUnpadTest, TestFP16) { + InitHalfInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sigmoid_compute.cu b/lite/kernels/cuda/sigmoid_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..2879f50b4d8a61c80c8c73bf8b3f43e4c8dbe5b0 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
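All of the new CUDA tests share one timing scaffold: an untimed warm-up loop of Launch() calls, then PrepareForRun() and FLAGS_repeats Run() calls timed together, with the average reported in milliseconds; the tests also call cudaDeviceSynchronize() before reading the elapsed time because the kernels are asynchronous. Stripped of kernel specifics, the pattern is roughly the sketch below; the helper name and the use of std::chrono instead of GetCurrentUS are illustrative, and the device synchronization is omitted for brevity.

#include <chrono>

// Generic benchmark skeleton mirroring the tests: warm up, then time
// PrepareForRun plus `repeats` Run calls and report the per-run average.
template <typename Kernel>
double BenchmarkMs(Kernel* kernel, int warmup, int repeats) {
  for (int i = 0; i < warmup; ++i) kernel->Launch();  // untimed warm-up
  auto start = std::chrono::steady_clock::now();
  kernel->PrepareForRun();                 // timed together with the runs,
  for (int i = 0; i < repeats; ++i) {      // as in the tests above
    kernel->Run();
  }
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count() /
         repeats;
}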
+ +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sigmoid_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SigmoidCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + int num = static_cast(param.X->numel()); + auto input = param.X->template data(); + auto output = param.Out->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::sigmoid(num, input, output, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SigmoidFp32 = + paddle::lite::kernels::cuda::SigmoidCompute; + +using SigmoidFp16 = + paddle::lite::kernels::cuda::SigmoidCompute; + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFloat, kNCHW, SigmoidFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFP16, kNCHW, SigmoidFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sigmoid_compute.h b/lite/kernels/cuda/sigmoid_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..455dc38d1f8d04fdaf5f4a70ee704c8a2fe7ddef --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SigmoidCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + virtual ~SigmoidCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sigmoid_compute_test.cc b/lite/kernels/cuda/sigmoid_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27904333b918baf0de7042005955b8fb44d6930 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute_test.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sigmoid_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SigmoidTest : public ::testing::Test { + protected: + SigmoidTest() : m_(8), n_(64), shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(shape_)); + x_gpu_.Resize(lite::DDim(shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } + + void RunBaseLine() { + for (int64_t i = 0; i < x_ref_.numel(); ++i) { + out_ref_.mutable_data()[i] = + 1.f / (1.f + expf(-1 * x_ref_.data()[i])); + } + } + + int m_, n_; + std::vector shape_; + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::ActivationParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SigmoidTest, TestFP32) { + InitFloatInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(SigmoidTest, TestFP16) { + InitHalfInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = 
out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute.cu b/lite/kernels/cuda/topk_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb4499b637a1435dec2dc913bf8141edd60130fc --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.cu @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void top_k_pooling_batch_kernel_reduction(Dtype *output_data, + const Dtype *input, + const int *height_offset, + const int *width_offset, + const int batch_size, + const int channel_num, + const int height_stride, + const int width_stride, + const int k) { + const Dtype *input_start = + input + + (blockIdx.x * channel_num + blockIdx.y) * height_stride * width_stride; + Dtype *output_start = + output_data + (blockIdx.x * channel_num + blockIdx.y) * k; + + int width = width_offset[blockIdx.x + 1] - width_offset[blockIdx.x]; + int height = height_offset[blockIdx.x + 1] - height_offset[blockIdx.x]; + int real_k = k < height * width ? 
k : height * width; + + extern __shared__ Dtype smem[]; + + Dtype min_val = -100000.0f; + for (int j = threadIdx.x; j < height * width; j += blockDim.x) { + int index_tmp = (j / width) * width_stride + j % width; + smem[j] = input_start[index_tmp]; + } + __syncthreads(); + + // get max val + int t = 0; + for (; t < real_k; ++t) { + // reduction + for (int gap = height * width; gap > 1;) { + if (threadIdx.x == 0) { // edge cond + if (gap % 2 != 0) { + Dtype value_first = smem[0]; + Dtype value_gap = smem[gap - 1]; + if (value_first < value_gap) { + smem[0] = value_gap; + smem[gap - 1] = value_first; + } + } + } + gap >>= 1; + for (int j = threadIdx.x; j < gap; j += blockDim.x) { + Dtype value_first = smem[j]; + Dtype value_gap = smem[j + gap]; + if (value_first < value_gap) { + smem[j] = value_gap; + smem[j + gap] = value_first; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + output_start[t] = smem[0]; + smem[0] = min_val; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < (k - t); i += blockDim.x) { + // output_start[t + i] = 0.0f; + } +} + +template +void TopkPoolingCompute::PrepareForRun() { + int device_id = lite::TargetWrapperCuda::GetCurDevice(); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device_id)); + _shared_mem_size = deviceProp.sharedMemPerBlock; +} + +template +void TopkPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0) + << "X sequence offset is not valid"; + CHECK(param.Y->lod().size() > 0 && param.Y->lod()[0].size() > 0) + << "Y sequence offset is not valid"; + + int width_offset_len = param.X->lod()[0].size(); + lite::DDim width_offset_shape(std::vector{width_offset_len}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.X->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.X->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _width_offset.mutable_data(TARGET(kCUDA)), + width_lod_0.data(), + sizeof(int) * width_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + int height_offset_len = param.Y->lod()[0].size(); + lite::DDim height_offset_shape(std::vector{height_offset_len}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.Y->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.Y->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _height_offset.mutable_data(TARGET(kCUDA)), + height_lod_0.data(), + sizeof(int) * height_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + + int num = x_tensor->dims()[0]; + int channel = x_tensor->dims()[1]; + int height = x_tensor->dims()[2]; + int width = x_tensor->dims()[3]; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = height * width; + + if (feat_map_size * sizeof(T) <= _shared_mem_size) { + dim3 blocks(num, channel); + dim3 threads(32, 1); + + top_k_pooling_batch_kernel_reduction< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + num, + channel, + height, + width, + param.top_k); + } else { + LOG(FATAL) << "Not implemented. 
Exceeded the shared memory limit."; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(topk_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::TopkPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/topk_pooling_compute.h b/lite/kernels/cuda/topk_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..abf16163812a74de8ebb8cce0dd7d80469e0a7d8 --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class TopkPoolingCompute + : public KernelLite { + public: + using param_t = operators::TopkPoolingParam; + + void Run() override; + + void PrepareForRun() override; + + virtual ~TopkPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + int _shared_mem_size; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute_test.cc b/lite/kernels/cuda/topk_pooling_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fb5c29f25bba0b4cc00f3eb58fc1c0726e6b23b --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
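topk_pooling keeps, for each (batch, channel) pair, the k largest activations inside the valid height-by-width window given by the Y and X LoD offsets. The CUDA kernel above stages that window in shared memory and repeats a parallel max-reduction k times, which is why it aborts when a feature map does not fit in sharedMemPerBlock. A simple host reference of the same selection is sketched below; note that it zero-fills positions past the window size only to stay self-contained, whereas the device kernel currently leaves that tail untouched (its zero fill is commented out).

#include <algorithm>
#include <functional>
#include <vector>

// Host reference for top-k pooling over one (batch, channel) slice: gather
// the valid height x width window, partially sort it in descending order,
// and keep the k largest values.
std::vector<float> TopkPoolRef(const float* slice,  // [height_stride * width_stride]
                               int valid_h, int valid_w,
                               int width_stride, int k) {
  std::vector<float> vals;
  vals.reserve(valid_h * valid_w);
  for (int h = 0; h < valid_h; ++h) {
    for (int w = 0; w < valid_w; ++w) {
      vals.push_back(slice[h * width_stride + w]);
    }
  }
  int real_k = std::min(k, static_cast<int>(vals.size()));
  std::partial_sort(vals.begin(), vals.begin() + real_k, vals.end(),
                    std::greater<float>());
  vals.resize(k, 0.f);  // pad with zeros when fewer than k elements exist
  return vals;
}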
+ +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class TopkPooingTest : public ::testing::Test { + protected: + TopkPooingTest() + : num(2), + channels(4), + height(4), + width(4), + top_k(2), + feat_map_num(height * width), + x_lod({{0, 4, 7}}), + y_lod({{0, 4, 7}}), + x_shape({num, channels, height, width}), + out_shape({num, channels * top_k}) { + CHECK_EQ(x_lod[0].size(), num + 1) << "invalid input."; + for (size_t i = 1; i < x_lod[0].size(); ++i) { + CHECK_LE(x_lod[0][i] - x_lod[0][i - 1], height) << "invalid input."; + } + + X_gpu.Resize(lite::DDim(x_shape)); + X_ref.Resize(lite::DDim(x_shape)); + X_ref.set_lod(x_lod); + Y_gpu.Resize(lite::DDim(x_shape)); + Y_ref.Resize(lite::DDim(x_shape)); + Y_ref.set_lod(y_lod); + auto x_ref_data = X_ref.mutable_data(); + auto y_ref_data = Y_ref.mutable_data(); + + // prepare input + for (int64_t i = 0; i < X_ref.numel(); i++) { + x_ref_data[i] = static_cast(i % 16); + } + for (int64_t i = 0; i < Y_ref.numel(); i++) { + y_ref_data[i] = static_cast(i % 16); + } + + Out_ref.Resize(lite::DDim(out_shape)); + Out_gpu.Resize(lite::DDim(out_shape)); + Out_cpu.Resize(lite::DDim(out_shape)); + + device_init(); + } + + void device_init() { + ctx.reset(new KernelContext); + cudaStreamCreate(&stream); + param.X = &X_gpu; + param.Y = &Y_gpu; + param.Out = &Out_gpu; + param.top_k = top_k; + param.feat_map_num = feat_map_num; + } + + void float_data_init() { + X_gpu.Assign(X_ref.data(), + X_gpu.dims()); + X_gpu.set_lod(X_ref.lod()); + Y_gpu.Assign(Y_ref.data(), + Y_gpu.dims()); + Y_gpu.set_lod(Y_ref.lod()); + } + + void half_data_init() {} + + void cpu_base(const lite::Tensor* X, + const lite::Tensor* Y, + lite::Tensor* Out) {} + + int num, channels, height, width; + int top_k, feat_map_num; + std::vector> x_lod, y_lod; + std::vector x_shape, out_shape; + lite::Tensor X_ref, Y_ref, Out_ref; + lite::Tensor X_gpu, Y_gpu; + lite::Tensor Out_cpu, Out_gpu; + + operators::TopkPoolingParam param; + std::unique_ptr ctx; + cudaStream_t stream; +}; + +TEST_F(TopkPooingTest, fp32) { + float_data_init(); + auto& context = ctx->As(); + context.SetExecStream(stream); + TopkPoolingCompute kernel; + kernel.SetParam(param); + kernel.SetContext(std::move(ctx)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(Out_cpu.mutable_data(), + Out_gpu.data(), + sizeof(float) * Out_gpu.numel(), + IoDirection::DtoH); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/transpose_compute.cu b/lite/kernels/cuda/transpose_compute.cu index c5693c674c573d7c9f59034dd3c0985c9d94a22f..ec7ecd16e0daa9f9cb696224ae498825fe75c5b4 100644 --- a/lite/kernels/cuda/transpose_compute.cu +++ b/lite/kernels/cuda/transpose_compute.cu @@ -13,17 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "lite/kernels/cuda/transpose_compute.h" + #include + #include "lite/core/op_registry.h" -#include "lite/kernels/cuda/transpose_compute.h" namespace paddle { namespace lite { namespace kernels { namespace cuda { -void TransposeCompute::Run() { - auto& param = this->Param(); +template +void TransposeCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); @@ -31,8 +34,8 @@ void TransposeCompute::Run() { lite::Tensor* Out = param.output; std::vector axes = param.axis; - const float* in = X->data(); - float* out = Out->mutable_data(TARGET(kCUDA)); + const T* in = X->template data(); + T* out = Out->mutable_data(TARGET(kCUDA)); int ndim = X->dims().size(); std::vector dims = X->dims().data(); @@ -40,7 +43,7 @@ void TransposeCompute::Run() { // NCHW -> NHWC if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 && axes[3] == 1) { - trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); + trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; @@ -49,13 +52,13 @@ void TransposeCompute::Run() { // NHWC -> NCHW if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 && axes[3] == 2) { - trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); + trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; } - trans.transpose(out, in, dims, axes, &stream); + trans_.transpose(out, in, dims, axes, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } @@ -65,34 +68,31 @@ void TransposeCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(transpose, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +using TransFp32 = + paddle::lite::kernels::cuda::TransposeCompute; + +using TransFp16 = + paddle::lite::kernels::cuda::TransposeCompute; + +REGISTER_LITE_KERNEL(transpose, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -REGISTER_LITE_KERNEL(transpose2, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -// REGISTER_LITE_KERNEL(transpose2, -// kCUDA, -// kFloat, -// kNCHW, -// paddle::lite::kernels::cuda::TransposeCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .Finalize(); +REGISTER_LITE_KERNEL(transpose, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), 
PRECISION(kFP16))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/transpose_compute.h b/lite/kernels/cuda/transpose_compute.h index 273d072231fb0608deb9ed729bdf153395ee983f..7e373c3b26c1701cd467148a06466a86f04e0c95 100644 --- a/lite/kernels/cuda/transpose_compute.h +++ b/lite/kernels/cuda/transpose_compute.h @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace cuda { -class TransposeCompute : public KernelLite { +template +class TransposeCompute : public KernelLite { public: using param_t = operators::TransposeParam; @@ -29,7 +30,7 @@ class TransposeCompute : public KernelLite { virtual ~TransposeCompute() = default; private: - lite::cuda::math::Transpose trans; + lite::cuda::math::Transpose trans_; }; } // namespace cuda diff --git a/lite/kernels/cuda/transpose_compute_test.cc b/lite/kernels/cuda/transpose_compute_test.cc index bf0d803a14a5f0e47c96128b953ae72a18798205..89654dd9c8a200f5672f23bd08c32b40b9b6f99e 100644 --- a/lite/kernels/cuda/transpose_compute_test.cc +++ b/lite/kernels/cuda/transpose_compute_test.cc @@ -13,11 +13,16 @@ // limitations under the License. #include "lite/kernels/cuda/transpose_compute.h" + #include #include #include #include +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + namespace paddle { namespace lite { namespace kernels { @@ -31,9 +36,9 @@ namespace { #define OUT(n, c, h, w) \ output_data[w + h * output_w + c * output_h * output_w + \ n * output_c * output_h * output_w] -void nchw2nhwc_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nchw2nhwcBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -64,9 +69,9 @@ void nchw2nhwc_ref(lite::Tensor* input, #define OUT(n, h, w, c) \ output_data[c + w * output_c + h * output_w * output_c + \ n * output_h * output_w * output_c] -void nhwc2nchw_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nhwc2nchwBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector& axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -89,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input, } } -void transpose_ref(lite::Tensor* input, +void TransBaseLine(const lite::Tensor* input, lite::Tensor* output, const std::vector axes) { auto* input_data = input->data(); @@ -123,7 +128,7 @@ void transpose_ref(lite::Tensor* input, } // namespace TEST(transpose_nchw, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -168,16 +173,15 @@ TEST(transpose_nchw, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nchw2nhwc_ref(&x_ref, &out_ref, axes); + Nchw2nhwcBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); - // transpose_ref(&x_ref, &out_ref, axes); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } TEST(transpose_nhwc, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -220,62 +224,146 @@ TEST(transpose_nhwc, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, 
out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nhwc2nchw_ref(&x_ref, &out_ref, axes); - // transpose_ref(&x_ref, &out_ref, axes); + Nhwc2nchwBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } -TEST(transpose, normal) { - TransposeCompute transpose_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); +class TransposeTest : public ::testing::Test { + protected: + TransposeTest() + : C_(3), + H_(128), + W_(64), + axes_({1, 2, 0}), + x_shape_({C_, H_, W_}), + out_shape_({H_, W_, C_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + auto X_ref__data = x_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + X_ref__data[i] = static_cast(i); + } - operators::TransposeParam param; + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); - lite::Tensor x, x_cpu, x_ref; - lite::Tensor out, out_cpu, out_ref; + InitParamAndContext(); + } - int C = 3, H = 128, W = 128; - std::vector axes({2, 0, 1}); - x.Resize({C, H, W}); - out.Resize({W, C, H}); + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.x = &x_gpu_; + param_.output = &out_gpu_; + param_.axis = axes_; + } - x_cpu.Resize({C, H, W}); - out_cpu.Resize({W, C, H}); + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } - x_ref.Resize({C, H, W}); - out_ref.Resize({W, C, H}); + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_ref_.dims())); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } - auto* x_cpu_data = x_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - auto* x_ref_data = x_ref.mutable_data(); + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + TransBaseLine(x, out, axes_); + } - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 1; - x_ref_data[i] = i + 1; + int C_, H_, W_; + std::vector axes_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_; + + operators::TransposeParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(TransposeTest, fp32) { + InitFloatInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); } - x.Assign(x_cpu_data, x_cpu.dims()); - param.x = &x; - param.output = &out; - param.axis = axes; - transpose_kernel.SetParam(param); - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - transpose_kernel.SetContext(std::move(ctx)); - transpose_kernel.Launch(); + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } cudaDeviceSynchronize(); - auto* out_data = out.mutable_data(TARGET(kCUDA)); - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - transpose_ref(&x_ref, &out_ref, axes); - auto* out_ref_data = out_ref.mutable_data(); - for (int i = 0; i < out.numel(); i++) { - 
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(TransposeTest, TestFP16) { + InitHalfInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_cpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); } } diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu index b847069879357ea600fd62b8f70d6c50e3c8c35f..b14073e5e1bfe074d355265726562579895dde86 100644 --- a/lite/kernels/cuda/var_conv_2d_compute.cu +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -184,6 +184,8 @@ using VarConvFp16 = REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); @@ -191,6 +193,9 @@ REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFP16, kNCHW, VarConvFp16, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .Finalize(); diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 6b4b2875f39c479f3ddd387230dbdf8e3d24ce3c..23f5639a9ddbafa38cc575ac5ca068916956a075 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -185,15 +185,11 @@ void YoloBoxCompute::Run() { anchors_.Resize({static_cast(anchors.size())}); int* d_anchors = anchors_.mutable_data(TARGET(kCUDA)); - // TargetWrapperCuda::MemcpyAsync(d_anchors, - // anchors.data(), - // sizeof(int) * anchors.size(), - // IoDirection::HtoD, - // stream); - CopySync(d_anchors, - anchors.data(), - sizeof(int) * anchors.size(), - IoDirection::HtoD); + 
TargetWrapperCuda::MemcpyAsync(d_anchors, + anchors.data(), + sizeof(int) * anchors.size(), + IoDirection::HtoD, + stream); int threads = 512; int blocks = (n * box_num + threads - 1) / threads; diff --git a/lite/kernels/fpga/activation_compute_test.cc b/lite/kernels/fpga/activation_compute_test.cc index cef87afffca65ee82ca63e58191d3877f62824f2..99f702b84b3439814433e7c416151b43772dfb0e 100644 --- a/lite/kernels/fpga/activation_compute_test.cc +++ b/lite/kernels/fpga/activation_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/activation_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/activation_compute.h" namespace paddle { namespace lite { @@ -37,8 +39,7 @@ void activation_compute_ref(const operators::ActivationParam& param) { } TEST(activation_fpga, retrive_op) { - auto activation = - KernelRegistry::Global().Create("relu"); + auto activation = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(activation.empty()); ASSERT_TRUE(activation.front()); } diff --git a/lite/kernels/fpga/fc_compute_test.cc b/lite/kernels/fpga/fc_compute_test.cc index 6ef8c02ed06dd89876dcab8c14fe389039bda614..08daecda314c771d0597951162d043f34d6316c9 100644 --- a/lite/kernels/fpga/fc_compute_test.cc +++ b/lite/kernels/fpga/fc_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/fc_compute.h" #include + #include #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/fc_compute.h" namespace paddle { namespace lite { @@ -76,8 +78,7 @@ void FillData(T* a, } TEST(fc_fpga, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); + auto fc = KernelRegistry::Global().Create("fc"); ASSERT_FALSE(fc.empty()); ASSERT_TRUE(fc.front()); } diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100755 new mode 100644 index 9248289fe9353705e7a2d84831b9f3de5d8ee7d7..ff93f1a6e1c30d006065deb04576255c24baed25 --- a/lite/kernels/fpga/pooling_compute_test.cc +++ b/lite/kernels/fpga/pooling_compute_test.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/pooling_compute.h" #include + #include #include #include -#include "lite/core/op_registry.h" #include "lite/backends/fpga/KD/float16.hpp" +#include "lite/core/op_registry.h" +#include "lite/kernels/fpga/pooling_compute.h" namespace paddle { namespace lite { @@ -277,8 +278,7 @@ TEST(pool_fpga, compute) { } TEST(pool_fpga, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/fpga/softmax_compute_test.cc b/lite/kernels/fpga/softmax_compute_test.cc index f92139d0f49b3d149531f11cb422e44ded6e7e64..a6f456ba1f140d07ccfcea0d7746c1061586611e 100644 --- a/lite/kernels/fpga/softmax_compute_test.cc +++ b/lite/kernels/fpga/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/softmax_compute.h" #include + #include #include + #include "lite/backends/fpga/KD/float16.hpp" #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index a70345708cce678b52e288a1f3eaf4ee1a23f541..381b9304142537da028b35c688128d34465965aa 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -16,3 +16,13 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(print_compute_host Host extra SRCS print_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(while_compute_host Host extra SRCS while_compute.cc DEPS ${lite_kernel_deps} program) +add_kernel(conditional_block_compute_host Host extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} program) +add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps}) + +if(LITE_BUILD_EXTRA) + lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host) +endif() diff --git a/lite/kernels/host/activation_grad_compute.cc b/lite/kernels/host/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b837cfda4572fa106a1ba1d015ffd5163b08340 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/host/activation_grad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * 2.0 * x_data[i]; + } +} + +void ReluGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = x_data[i] > 0 ? out_grad_data[i] : 0.0; + } +} + +void TanhGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.Out); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto out_data = param.Out->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * + (static_cast(1.0) - out_data[i] * out_data[i]); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(square_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL(relu_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL(tanh_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/host/activation_grad_compute.h b/lite/kernels/host/activation_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d942b901c448ee87410a2030ea0f9f10aca0e493 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +class ReluGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~ReluGradCompute() = default; +}; + +class TanhGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~TanhGradCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/assign_compute.cc b/lite/kernels/host/assign_compute.cc index e496ffbd1d9a6362d730117be949cbdab83ec62a..bfbbc32e5f3b3b4dd5936e0e296306641312cabf 100644 --- a/lite/kernels/host/assign_compute.cc +++ b/lite/kernels/host/assign_compute.cc @@ -51,3 +51,19 @@ REGISTER_LITE_KERNEL( PRECISION(kAny), DATALAYOUT(kAny))}) .Finalize(); + +REGISTER_LITE_KERNEL(assign, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::AssignCompute, + def_tensor_array) + .BindInput("X", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/host/conditional_block_compute.cc similarity index 51% rename from lite/kernels/arm/conditional_block_compute.cc rename to lite/kernels/host/conditional_block_compute.cc index f0bd43e1300d4034241c03d3e4ce27dcaa59c1e5..5bdca012dd4e838f3371bae7cf17634513d59db5 100644 --- a/lite/kernels/arm/conditional_block_compute.cc +++ b/lite/kernels/host/conditional_block_compute.cc @@ -12,28 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/conditional_block_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/conditional_block_compute.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void ConditionalBlockCompute::PrepareForRun() { - auto& param = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto& param = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } + void ConditionalBlockCompute::Run() { - auto& param = Param(); + auto& param = this->Param(); for (auto& out : param.outs) { out->clear(); } @@ -43,32 +36,40 @@ void ConditionalBlockCompute::Run() { auto* cond_data = cond->data(); need_run = cond_data[0]; } else { - auto x = param.x; - for (auto pt : x) { - if (pt == nullptr || !pt->IsInitialized() || pt->dims().empty()) { + for (auto input : param.inputs) { + if (input == nullptr || !input->IsInitialized() || + input->dims().empty()) { need_run = false; break; } } } if (need_run) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL(conditional_block, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ConditionalBlockCompute, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::ConditionalBlockCompute, def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Cond", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Scope", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Input", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindInput("Cond", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) + .BindOutput("Out", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("Scope", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/conditional_block_compute.h b/lite/kernels/host/conditional_block_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8d3381ce3c4d6da076e6bb477df423bc640c56c9 --- /dev/null +++ b/lite/kernels/host/conditional_block_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class ConditionalBlockCompute + : public KernelLite { + public: + using param_t = operators::ConditionalBlockParam; + + void PrepareForRun() override; + void Run() override; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/print_compute.cc b/lite/kernels/host/print_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..00c8ab7b13597ad33b9fafc878cd553572462a99 --- /dev/null +++ b/lite/kernels/host/print_compute.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/print_compute.h" + +#include // NOLINT +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +const char kForward[] = "FORWARD"; +const char kBackward[] = "BACKWARD"; +const char kBoth[] = "BOTH"; + +class TensorFormatter { + public: + TensorFormatter() {} + + std::string Format(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + std::stringstream log_stream; + if (!tensor_name.empty()) { + log_stream << "Variable: " << tensor_name << std::endl; + } + + if (!message.empty()) { + log_stream << " - message: " << message << std::endl; + } + + if (print_tensor_lod_) { + log_stream << " - lod: {"; + const LoD& lod = print_tensor.lod(); + for (auto level : lod) { + log_stream << "{"; + bool is_first = true; + for (auto i : level) { + if (is_first) { + log_stream << i; + is_first = false; + } else { + log_stream << ", " << i; + } + } + log_stream << "}"; + } + log_stream << "}" << std::endl; + } + + log_stream << " - place: " << TargetToStr(print_tensor.target()) + << std::endl; // TODO(hong19860320) always kHost + + if (print_tensor_shape_) { + log_stream << " - shape: " << print_tensor.dims().repr() << std::endl; + } + + if (print_tensor_layout_) { + log_stream << " - layout: " + << DataLayoutToStr( + DATALAYOUT(kNCHW)) // TODO(hong19860320) Query the data + // layout from target tensor + << std::endl; + } + + auto dtype = print_tensor.precision(); + if (print_tensor_type_) { + log_stream << " - dtype: " << PrecisionToStr(dtype) << std::endl; + } + + if (dtype == PRECISION(kBool)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt8)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt16)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt32)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt64)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kFloat)) { + FormatData(print_tensor, log_stream); + } else { + 
log_stream << "\tdata: unprintable type: " << PrecisionToStr(dtype) + << std::endl; + } + return log_stream.str(); + } + + void Print(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + static std::mutex mutex; + std::lock_guard lock(mutex); + std::cout << Format(print_tensor, tensor_name, message); + } + + void SetPrintTensorType(bool print_tensor_type) { + print_tensor_type_ = print_tensor_type; + } + void SetPrintTensorShape(bool print_tensor_shape) { + print_tensor_shape_ = print_tensor_shape; + } + void SetPrintTensorLod(bool print_tensor_lod) { + print_tensor_lod_ = print_tensor_lod; + } + void SetPrintTensorLayout(bool print_tensor_layout) { + print_tensor_layout_ = print_tensor_layout; + } + void SetSummarize(int64_t summarize) { summarize_ = summarize; } + + private: + template + void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) { + int64_t print_size = summarize_ == -1 + ? print_tensor.numel() + : std::min(summarize_, print_tensor.numel()); + const T* data = print_tensor.data(); // Always kHost, so unnessary to + // copy the data from device + log_stream << " - data: ["; + if (print_size > 0) { + log_stream << data[0]; + for (int64_t i = 1; i < print_size; ++i) { + log_stream << " " << data[i]; + } + } + log_stream << "]" << std::endl; + } + + int64_t summarize_ = -1; + bool print_tensor_type_ = true; + bool print_tensor_shape_ = true; + bool print_tensor_lod_ = true; + bool print_tensor_layout_ = true; +}; + +void PrintCompute::Run() { + auto& param = Param(); + param.out->CopyDataFrom(*param.in); + + if ((param.is_forward && param.print_phase == kBackward) || + (!param.is_forward && param.print_phase == kForward)) { + return; + } + + int first_n = param.first_n; + if (first_n > 0 && ++times_ > first_n) return; + + TensorFormatter formatter; + const std::string& name = param.print_tensor_name ? param.name : ""; + formatter.SetPrintTensorType(param.print_tensor_type); + formatter.SetPrintTensorShape(param.print_tensor_shape); + formatter.SetPrintTensorLod(param.print_tensor_lod); + formatter.SetPrintTensorLayout(param.print_tensor_layout); + formatter.SetSummarize(static_cast(param.summarize)); + formatter.Print(*param.in, name, param.message); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + print, kHost, kAny, kAny, paddle::lite::kernels::host::PrintCompute, def) + .BindInput("In", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/host/print_compute.h b/lite/kernels/host/print_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91a54182d2d2e00250da01fcd5d62556da930198 --- /dev/null +++ b/lite/kernels/host/print_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class PrintCompute + : public KernelLite { + public: + using param_t = operators::PrintParam; + + void Run() override; + + virtual ~PrintCompute() = default; + + private: + mutable int times_{0}; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/retinanet_detection_output_compute.cc b/lite/kernels/host/retinanet_detection_output_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95a4bf708e7f03aee9d9ac99323b173287260b13 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.cc @@ -0,0 +1,435 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/retinanet_detection_output_compute.h" +#include +#include +#include +#include +#include "lite/operators/retinanet_detection_output_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +bool SortScoreTwoPairDescend(const std::pair>& pair1, + const std::pair>& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const std::vector& box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const std::vector& box1, + const std::vector& box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void NMSFast(const std::vector>& cls_dets, + const T nms_threshold, + const T eta, + std::vector* selected_indices) { + int64_t num_boxes = cls_dets.size(); + std::vector> sorted_indices; + for (int64_t i = 0; i < num_boxes; ++i) { + sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort( + sorted_indices.begin(), sorted_indices.end(), SortScorePairDescend); + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + + overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void DeltaScoreToPrediction( + const std::vector& bboxes_data, + const std::vector& anchors_data, + T im_height, + T im_width, + T im_scale, + int class_num, + const std::vector>& sorted_indices, + std::map>>* preds) { + im_height = static_cast(std::round(im_height / im_scale)); + im_width = static_cast(std::round(im_width / im_scale)); + T zero(0); + int i = 0; + for (const auto& it : sorted_indices) { + T score = it.first; + int idx = it.second; + int a = idx / class_num; + int c = idx % class_num; + + int box_offset = a * 4; + T anchor_box_width = + anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; + T anchor_box_height = + anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; + T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; + T anchor_box_center_y = + anchors_data[box_offset + 1] + anchor_box_height / 2; + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; + target_box_center_y = + bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; + target_box_width = std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; + target_box_height = + std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; + T pred_box_xmin = target_box_center_x - target_box_width / 2; + T pred_box_ymin = target_box_center_y - target_box_height / 2; + T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; + T pred_box_ymax = 
target_box_center_y + target_box_height / 2 - 1; + pred_box_xmin = pred_box_xmin / im_scale; + pred_box_ymin = pred_box_ymin / im_scale; + pred_box_xmax = pred_box_xmax / im_scale; + pred_box_ymax = pred_box_ymax / im_scale; + + pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); + pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); + pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); + pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); + + std::vector one_pred; + one_pred.push_back(pred_box_xmin); + one_pred.push_back(pred_box_ymin); + one_pred.push_back(pred_box_xmax); + one_pred.push_back(pred_box_ymax); + one_pred.push_back(score); + (*preds)[c].push_back(one_pred); + i++; + } +} + +template +void MultiClassNMS(const std::map>>& preds, + int class_num, + const int keep_top_k, + const T nms_threshold, + const T nms_eta, + std::vector>* nmsed_out, + int* num_nmsed_out) { + std::map> indices; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (static_cast(preds.count(c))) { + const std::vector> cls_dets = preds.at(c); + NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); + num_det += indices[c].size(); + } + } + + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(preds.at(label)[idx][4], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScoreTwoPairDescend); + if (num_det > keep_top_k) { + score_index_pairs.resize(keep_top_k); + } + + // Store the new indices. + std::map> new_indices; + for (const auto& it : score_index_pairs) { + int label = it.second.first; + int idx = it.second.second; + std::vector one_pred; + one_pred.push_back(label); + one_pred.push_back(preds.at(label)[idx][4]); + one_pred.push_back(preds.at(label)[idx][0]); + one_pred.push_back(preds.at(label)[idx][1]); + one_pred.push_back(preds.at(label)[idx][2]); + one_pred.push_back(preds.at(label)[idx][3]); + nmsed_out->push_back(one_pred); + } + + *num_nmsed_out = (num_det > keep_top_k ? 
keep_top_k : num_det); +} + +template +void RetinanetDetectionOutput( + const operators::RetinanetDetectionOutputParam& param, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, + const Tensor& im_info, + std::vector>* nmsed_out, + int* num_nmsed_out) { + int64_t nms_top_k = param.nms_top_k; + int64_t keep_top_k = param.keep_top_k; + T nms_threshold = static_cast(param.nms_threshold); + T nms_eta = static_cast(param.nms_eta); + T score_threshold = static_cast(param.score_threshold); + + int64_t class_num = scores[0].dims()[1]; + std::map>> preds; + for (size_t l = 0; l < scores.size(); ++l) { + // Fetch per level score + Tensor scores_per_level = scores[l]; + // Fetch per level bbox + Tensor bboxes_per_level = bboxes[l]; + // Fetch per level anchor + Tensor anchors_per_level = anchors[l]; + + int64_t scores_num = scores_per_level.numel(); + int64_t bboxes_num = bboxes_per_level.numel(); + std::vector scores_data(scores_num); + std::vector bboxes_data(bboxes_num); + std::vector anchors_data(bboxes_num); + std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); + std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); + std::copy_n(anchors_per_level.data(), bboxes_num, anchors_data.begin()); + std::vector> sorted_indices; + + // For the highest level, we take the threshold 0.0 + T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0); + GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); + auto* im_info_data = im_info.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + DeltaScoreToPrediction(bboxes_data, + anchors_data, + im_height, + im_width, + im_scale, + class_num, + sorted_indices, + &preds); + } + + MultiClassNMS(preds, + class_num, + keep_top_k, + nms_threshold, + nms_eta, + nmsed_out, + num_nmsed_out); +} + +template +void MultiClassOutput(const std::vector>& nmsed_out, + Tensor* outs) { + auto* odata = outs->mutable_data(); + int count = 0; + int64_t out_dim = 6; + for (size_t i = 0; i < nmsed_out.size(); ++i) { + odata[count * out_dim] = nmsed_out[i][0] + 1; // label + odata[count * out_dim + 1] = nmsed_out[i][1]; // score + odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin + odata[count * out_dim + 3] = nmsed_out[i][3]; // xmin + odata[count * out_dim + 4] = nmsed_out[i][4]; // xmin + odata[count * out_dim + 5] = nmsed_out[i][5]; // xmin + count++; + } +} + +void RetinanetDetectionOutputCompute::Run() { + auto& param = Param(); + auto& boxes = param.bboxes; + auto& scores = param.scores; + auto& anchors = param.anchors; + auto* im_info = param.im_info; + auto* outs = param.out; + + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + boxes_list[j] = *boxes[j]; + scores_list[j] = *scores[j]; + anchors_list[j] = *anchors[j]; + } + auto score_dims = scores_list[0].dims(); + int64_t batch_size = score_dims[0]; + auto box_dims = boxes_list[0].dims(); + int64_t box_dim = box_dims[2]; + int64_t out_dim = box_dim + 2; + + std::vector>> all_nmsed_out; + std::vector batch_starts = {0}; + for (int i = 0; i < batch_size; ++i) { + int num_nmsed_out = 0; + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + auto score_dims = scores_list[j].dims(); + score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); + 
score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); + box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); + box_per_batch_list[j].Resize({score_dims[1], box_dim}); + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + + std::vector> nmsed_out; + RetinanetDetectionOutput(param, + score_per_batch_list, + box_per_batch_list, + anchors_list, + im_info_slice, + &nmsed_out, + &num_nmsed_out); + all_nmsed_out.push_back(nmsed_out); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({0, out_dim}); + } else { + outs->Resize({static_cast(num_kept), out_dim}); + for (int i = 0; i < batch_size; ++i) { + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(all_nmsed_out[i], &out); + } + } + } + + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + retinanet_detection_output, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::RetinanetDetectionOutputCompute, + def) + .BindInput("BBoxes", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scores", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Anchors", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ImInfo", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/host/retinanet_detection_output_compute.h b/lite/kernels/host/retinanet_detection_output_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..612ea7105e2728b856f02d71e9fcfaea2a1ef680 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class RetinanetDetectionOutputCompute + : public KernelLite { + public: + void Run() override; + + virtual ~RetinanetDetectionOutputCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute.cc b/lite/kernels/host/where_index_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06be8d332734f3e41b0414e891c8810a117d8a6 --- /dev/null +++ b/lite/kernels/host/where_index_compute.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
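Aside (not part of the patch): the retinanet_detection_output kernel above is built on a standard greedy NMS over Jaccard (IoU) overlap. A compact standalone sketch of those two pieces, using the same {xmin, ymin, xmax, ymax, score} layout and the un-normalized "+1" area convention as the kernel (illustrative, not the Lite implementation itself):

#include <algorithm>
#include <vector>

using Box = std::vector<float>;  // {xmin, ymin, xmax, ymax, score}

// Intersection-over-union of two boxes (un-normalized pixel coordinates).
float IoU(const Box& a, const Box& b) {
  float ix1 = std::max(a[0], b[0]), iy1 = std::max(a[1], b[1]);
  float ix2 = std::min(a[2], b[2]), iy2 = std::min(a[3], b[3]);
  float iw = std::max(0.f, ix2 - ix1 + 1.f);
  float ih = std::max(0.f, iy2 - iy1 + 1.f);
  float inter = iw * ih;
  float area_a = (a[2] - a[0] + 1.f) * (a[3] - a[1] + 1.f);
  float area_b = (b[2] - b[0] + 1.f) * (b[3] - b[1] + 1.f);
  return inter / (area_a + area_b - inter);
}

// Greedy NMS: keep the highest-scoring box, drop boxes overlapping it
// above the threshold, repeat. Returns indices of the kept boxes.
std::vector<int> GreedyNMS(const std::vector<Box>& dets, float nms_threshold) {
  std::vector<int> order(dets.size());
  for (size_t i = 0; i < dets.size(); ++i) order[i] = static_cast<int>(i);
  std::stable_sort(order.begin(), order.end(), [&](int l, int r) {
    return dets[l][4] > dets[r][4];
  });
  std::vector<int> keep;
  for (int idx : order) {
    bool ok = true;
    for (int k : keep) {
      if (IoU(dets[idx], dets[k]) > nms_threshold) {
        ok = false;
        break;
      }
    }
    if (ok) keep.push_back(idx);
  }
  return keep;
}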
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +static void where_index_rank4(const int64_t* true_index, + int true_num, + const int64_t* stride, + int64_t* out) { + int cnt = true_num >> 1; + register int64_t stride0 = stride[0]; + register int64_t stride1 = stride[1]; + register int64_t stride2 = stride[2]; + register int64_t stride3 = stride[3]; + for (int i = 0; i < cnt; ++i) { + int64_t index0 = true_index[i * 2]; + int64_t index1 = true_index[i * 2 + 1]; + int out_index = i * 8; + // rank0 + register int64_t oindex0 = index0 / stride0; + register int64_t oindex1 = index1 / stride0; + out[out_index] = oindex0; + index0 -= oindex0 * stride0; + index1 -= oindex1 * stride0; + out[out_index + 4] = oindex1; + out_index++; + // rank1 + oindex0 = index0 / stride1; + oindex1 = index1 / stride1; + out[out_index] = oindex0; + index0 -= oindex0 * stride1; + index1 -= oindex1 * stride1; + out[out_index + 4] = oindex1; + out_index++; + // rank2 + oindex0 = index0 / stride2; + oindex1 = index1 / stride2; + out[out_index] = oindex0; + index0 -= oindex0 * stride2; + index1 -= oindex1 * stride2; + out[out_index + 4] = oindex1; + out_index++; + // rank3 + oindex0 = index0 / stride3; + oindex1 = index1 / stride3; + out[out_index] = oindex0; + out[out_index + 4] = oindex1; + } + // remain + for (int r = cnt * 2; r < true_num; ++r) { + int out_index = r * 4; + int64_t index = true_index[r]; + for (int i = 0; i < 4; ++i) { + out[out_index + i] = index / stride[i]; + index -= out[out_index + i] * stride[i]; + } + } +} + +inline void where_index_rank1(const int64_t* true_index, + int true_num, + int64_t* out) { + memcpy(out, true_index, true_num * sizeof(int64_t)); +} + +static void where_index_rankn(const int64_t* true_index, + int true_num, + const int64_t* stride, + int rank, + int64_t* out) { + int out_index = 0; + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int r = 0; r < rank; ++r) { + out[out_index] = index / stride[r]; + index -= out[out_index++] * stride[r]; + } + } +} + +template +void WhereIndexKernel(const operators::WhereIndexParam& param) { + auto* input = param.input; + auto* output = param.output; + auto dims = input->dims(); + auto numel = dims.production(); + int64_t rank = static_cast(dims.size()); + const T* cond_data = input->template data(); + int64_t true_num = 0; + std::vector true_index(numel); + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index[true_num] = i; + true_num++; + } + } + output->Resize({true_num, rank}); + if (true_num == 0) { + return; + } + auto* out_ptr = output->template mutable_data(); + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + if (rank == 1) { + 
where_index_rank1(true_index.data(), true_num, out_ptr); + } else if (rank == 4) { + where_index_rank4(true_index.data(), true_num, stride.data(), out_ptr); + } else { + where_index_rankn( + true_index.data(), true_num, stride.data(), rank, out_ptr); + } +} + +void WhereIndexCompute::Run() { + auto& param = this->Param(); + switch (param.input->precision()) { + case PRECISION(kFloat): + WhereIndexKernel(param); + break; + case PRECISION(kInt32): + WhereIndexKernel(param); + break; + case PRECISION(kInt64): + WhereIndexKernel(param); + break; + case PRECISION(kInt8): + WhereIndexKernel(param); + break; + case PRECISION(kBool): + WhereIndexKernel(param); + break; + default: + LOG(FATAL) << "WhereIndex does not implement for the " + << "input type:" << static_cast(param.input->precision()); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +using whereindex = paddle::lite::kernels::host::WhereIndexCompute; + +REGISTER_LITE_KERNEL(where_index, kHost, kAny, kAny, whereindex, def) + .BindInput("Condition", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/host/where_index_compute.h b/lite/kernels/host/where_index_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6936e3ed8f0ee16bf0e41095bbcbd0c18169d62f --- /dev/null +++ b/lite/kernels/host/where_index_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/where_index_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhereIndexCompute : public KernelLite { + public: + using param_t = operators::WhereIndexParam; + + void Run() override; + + virtual ~WhereIndexCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute_test.cc b/lite/kernels/host/where_index_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7097bdcae2bb319331af72c390a9d5de4fc23a9f --- /dev/null +++ b/lite/kernels/host/where_index_compute_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
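Aside (not part of the patch): the where_index kernel above converts the flat index of every true element into a multi-dimensional coordinate by repeatedly dividing by per-dimension strides. A standalone sketch of that decomposition in plain C++, using the same row-major stride construction as the kernel and its reference implementation in the test:

#include <cstdint>
#include <vector>

// Row-major strides: stride[last] = 1, stride[i] = stride[i+1] * dims[i+1].
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> stride(dims.size());
  stride.back() = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    stride[i] = stride[i + 1] * dims[i + 1];
  }
  return stride;
}

// Decompose a flat index into coordinates, as where_index does per true element.
std::vector<int64_t> Unravel(int64_t flat, const std::vector<int64_t>& stride) {
  std::vector<int64_t> coord(stride.size());
  for (size_t r = 0; r < stride.size(); ++r) {
    coord[r] = flat / stride[r];
    flat -= coord[r] * stride[r];
  }
  return coord;
}

// Example: flat index 5 in a {2, 3} tensor has strides {3, 1}
// and unravels to the coordinate {1, 2}.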
+ +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +void where_index_compute_ref(lite::Tensor* condition, lite::Tensor* out) { + auto dims = condition->dims(); + auto numel = condition->numel(); + const int64_t rank = static_cast(dims.size()); + const T* cond_data = condition->data(); + std::vector true_index; + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index.push_back(i); + } + } + int64_t true_num = static_cast(true_index.size()); + out->Resize({true_num, rank}); + int64_t* out_ptr = out->mutable_data(); + if (true_num == 0) { + return; + } + + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int j = 0; j < rank; ++j) { + out_ptr[i * rank + j] = index / stride[j]; + index -= out_ptr[i * rank + j] * stride[j]; + } + } +} + +TEST(where_index, init) { + WhereIndexCompute where_index; + ASSERT_EQ(where_index.precision(), PRECISION(kAny)); + ASSERT_EQ(where_index.target(), TARGET(kHost)); +} + +TEST(where_index, retrive_op) { + auto where_index = + KernelRegistry::Global().Create( + "where_index"); + ASSERT_FALSE(where_index.empty()); + ASSERT_TRUE(where_index.front()); +} + +TEST(where_index, compute) { + paddle::lite::DeviceInfo::Init(); + WhereIndexCompute where_index; + operators::WhereIndexParam param; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + param.input = &input; + param.output = &output; + where_index.SetParam(param); + for (auto& n : {1, 2, 4}) { + for (auto& c : {1, 3, 21, 32}) { + for (auto& h : {1, 5, 63}) { + for (auto& w : {1, 5, 64}) { + for (auto& dim_size : {1, 2, 3, 4}) { + for (int i = 0; i < 5; ++i) { + std::vector in_shape; + in_shape.push_back(n); + in_shape.push_back(c); + in_shape.push_back(h); + in_shape.push_back(w); + int outer = 1; + for (int i = dim_size - 1; i < in_shape.size(); ++i) { + outer *= in_shape[i]; + } + in_shape.resize(dim_size); + in_shape[dim_size - 1] = outer; + + DDim indim(in_shape); + LOG(INFO) << "in dims: "; + for (int i = 0; i < dim_size; ++i) { + LOG(INFO) << in_shape[i]; + } + input.Resize(indim); + std::default_random_engine engine; + std::uniform_real_distribution dist(-1, 1); + if (i == 0) { + int* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 1) { + int64_t* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 2) { + int8_t* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 3) { + bool* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = dist(engine) > 0; + } + where_index_compute_ref(&input, &output_ref); + } else { + float* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = dist(engine) > 0; + } + where_index_compute_ref(&input, &output_ref); + 
} + where_index.Run(); + const int64_t* outdata = output.data(); + const int64_t* outdata_ref = output_ref.data(); + CHECK_EQ(output.dims(), output_ref.dims()) + << "where_index out shape error! out_dim is not equal " + "to out_ref dim"; + for (int i = 0; i < output.numel(); i++) { + if (std::abs(outdata[i] - outdata_ref[i]) > 0) { + LOG(FATAL) << "where_index cmp error, i: " << i + << ", output_data: " << outdata[i] + << ", output_ref_data: " << outdata_ref[i] + << "input precision: " + << static_cast(input.precision()); + } + } + } + } + } + } + } + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(where_index, kHost, kAny, kAny, def); diff --git a/lite/kernels/arm/while_compute.cc b/lite/kernels/host/while_compute.cc similarity index 50% rename from lite/kernels/arm/while_compute.cc rename to lite/kernels/host/while_compute.cc index 9241fd410a542cef797b57b9341f59895b0f734d..4886b5ffe0f48b231bcef59b5494fc126b8b69e2 100644 --- a/lite/kernels/arm/while_compute.cc +++ b/lite/kernels/host/while_compute.cc @@ -12,44 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/while_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/while_compute.h" +#include +#include namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void WhileCompute::PrepareForRun() { - auto ¶m = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto ¶m = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } void WhileCompute::Run() { - auto ¶m = Param(); + auto ¶m = this->Param(); while (param.cond->data()[0]) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL( - while, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::WhileCompute, def) - .BindInput("X", {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) + while, kHost, kAny, kAny, paddle::lite::kernels::host::WhileCompute, def) + .BindInput("X", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .BindInput("Condition", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) .BindOutput("Out", - {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) - .BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))}) + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("StepScopes", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/while_compute.h b/lite/kernels/host/while_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..42065865e45c18376034dea0e105bc6d4f1f053f --- /dev/null +++ b/lite/kernels/host/while_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
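Aside (not part of the patch): the relocated while kernel above simply re-executes its sub-program for as long as the scalar condition tensor holds true; the loop terminates only because the sub-program rewrites that same condition buffer. A minimal sketch of the contract, with std::function standing in for the RuntimeProgram (illustrative):

#include <functional>

// 'cond' points at a scalar bool that the executed block is expected to
// update in place; it is re-read on every iteration, mirroring
// WhileCompute::Run().
inline void RunWhile(const std::function<void()>& run_block, const bool* cond) {
  while (cond[0]) {
    run_block();
  }
}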
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhileCompute + : public KernelLite { + public: + using param_t = operators::WhileParam; + + void Run() override; + void PrepareForRun() override; + + virtual ~WhileCompute() = default; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..be0a8d05081e3dda5f474689dc4eed23bc5f56c4 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_huawei_ascend_npu subgraph_bridge_engine ${huawei_ascend_npu_subgraph_bridges}) diff --git a/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6fac2b0b560dcc467132abe9a21c2c75d266a77 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt @@ -0,0 +1,19 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(subgraph_bridge_utility_huawei_ascend_npu SRCS utility.cc DEPS) +lite_cc_library(subgraph_bridge_graph_huawei_ascend_npu SRCS graph.cc DEPS subgraph_bridge_utility_huawei_ascend_npu) + +set(huawei_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_huawei_ascend_npu subgraph_bridge_graph_huawei_ascend_npu) + +lite_cc_library(subgraph_bridge_act_op_huawei_ascend_npu SRCS act_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_conv_op_huawei_ascend_npu SRCS conv_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) + +set(huawei_ascend_npu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_engine + subgraph_bridge_graph_huawei_ascend_npu + subgraph_bridge_act_op_huawei_ascend_npu + subgraph_bridge_conv_op_huawei_ascend_npu + CACHE INTERNAL "huawei_ascend_npu_subgraph_bridges") diff --git a/lite/kernels/huawei_ascend_npu/bridges/act_op.cc b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0293515356a13035fcdc4725c5de132ea06ceb67 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +template +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + + return SUCCESS; +} + +template <> +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + // only for leaky_relu + auto alpha = op_info->GetAttr("alpha"); + act_op->set_attr_negative_slope(alpha); + + return SUCCESS; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + sigmoid, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + tanh, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu6, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + leaky_relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softsign, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softplus, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc 
b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..075bbca8bd63a3c12d74b3624c6a1d51d7edfb76 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/conv_op.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " << op_type << "... "; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + ge::DataType ge_data_type = CvtPrecisionType(input->precision()); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + // Conv2D: groups must set to 1; DepthwiseConv2D: groups not supported. + CHECK_LE(groups, 1) + << "[HUAWEI_ASCEND_NPU] groups > 1 NOT supported, groups: " << groups; + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? 
op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + // Input node + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + input_node = graph->Add(input_name, *input); + } + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[HUAWEI_ASCEND_NPU] Paddings size should be " + "the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + // Check depthwise mode, and decide whether use DepthwiseConv2D Op + bool use_depthwise_conv = false; + bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1); + if (is_depthwise_mode && dilations[0] == 1 && dilations[1] == 1) { + use_depthwise_conv = true; + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] DepthwiseConv2D op is used."; + } + + // Filter node + auto filter_node = graph->Add(filter_name, *filter); + + // Add bias node if exists bias + // Supports the bias nodes with the following dimensions + // 0: {oc} => 1D tensor of foramt ND + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + std::vector bias_shape; + std::shared_ptr bias_node = nullptr; + bool is_channel_bias = false; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + // Ascend must update convop desc, or IR model build will fail + ge::TensorDesc conv2d_input_desc_x( + ge::Shape(CvtShape(input_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_filter( + ge::Shape(CvtShape(filter_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_bias( + ge::Shape(bias_shape), ge::FORMAT_ND, ge_data_type); + ge::TensorDesc conv2d_output_desc_y( + ge::Shape(CvtShape(output_dims)), ge::FORMAT_NCHW, ge_data_type); + // Setting desc name + conv2d_input_desc_x.SetName("conv2d_input_desc_x"); + conv2d_input_desc_filter.SetName("conv2d_input_desc_filter"); + conv2d_input_desc_bias.SetName("conv2d_input_desc_bias"); + conv2d_output_desc_y.SetName("conv2d_output_desc_y"); + // Conv node + std::shared_ptr conv_node = nullptr; + if (use_depthwise_conv && is_depthwise_mode) { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + 
conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({1, 1, strides[0], strides[1]})); + conv_op->set_attr_dilations({1, 1, dilations[0], dilations[1]}); + conv_op->set_attr_pads( + {paddings[0], paddings[1], paddings[2], paddings[3]}); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } else { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({bs, ic, strides[0], strides[1]})); + conv_op->set_attr_pads(ge::Operator::OpListInt( + {paddings[0], paddings[1], paddings[2], paddings[3]})); + conv_op->set_attr_dilations( + ge::Operator::OpListInt({bs, ic, dilations[0], dilations[1]})); + conv_op->set_attr_groups(groups); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } + // append Add node to support bias + if (bias_node != nullptr && !is_channel_bias) { + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_node = add_node; + } + CHECK(conv_node); + + // ONLY support relu/leaky_relu now + // to do (@qili93): add more act types + if (!act_type.empty()) { + if (act_type == "relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + } else if (act_type == "leaky_relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + act_op->set_attr_negative_slope(leaky_relu_alpha); + } else { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] act type not supported: " + << act_type; + return FAILED; + } + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE( + depthwise_conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.cc b/lite/kernels/huawei_ascend_npu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e1eaf1228fd3df7583ddc194b3d58862ddc0e12 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Const or data node " << name + << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + ge::TensorDesc desc(ge::Shape(shape), + CvtDataLayoutType(layout), + CvtPrecisionType(precision)); + desc.SetName("const_node_desc"); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + node->data()->update_output_desc_y(desc); + } else { + // Data node + node = Add(name, shape, precision, layout); + } + return node; +} + +// Data node +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); + ge::TensorDesc desc( + ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); + desc.SetName("data_node_desc"); + node->data()->update_input_desc_x(desc); + node->data()->update_output_desc_y(desc); + return node; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.h b/lite/kernels/huawei_ascend_npu/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..bb763004939a4ccfffdd526e92bc029509aab45e --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.h @@ -0,0 +1,196 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
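// [Editor's note - illustration only, not part of the patch] Graph::Add above keeps a
// vector of nodes per Paddle variable name, returns the 1-based count, and the Graph
// template Add (in graph.h below) appends that index to build a unique HiAI IR name;
// Get() returns the most recently added node. A minimal standalone sketch of that
// bookkeeping, using plain STL types instead of the Node/ge::Operator classes:
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> nodes;  // var name -> generated IR names
  auto add = [&](const std::string& name) {
    auto& v = nodes[name];
    v.push_back(name + "__" + std::to_string(v.size() + 1));  // e.g. "conv_out__1"
    return v.size();                                          // returned index is >= 1
  };
  assert(add("conv_out") == 1);
  assert(add("conv_out") == 2);  // only variable nodes may be re-added under the same name
  assert(nodes.at("conv_out").back() == "conv_out__2");  // Get(name) returns the latest node
  return 0;
}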
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "op_proto/built-in/inc/all_ops.h" // opp/op_proto/built-in/inc + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { + public: + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_role(Role role) { role_ = role; } + + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } + + private: + std::shared_ptr data_{nullptr}; + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + Role role_{Role::kVar}; +}; + +class Graph { + public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node + template + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; + } + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data( + std::make_shared(name + "__" + paddle::lite::to_string(idx))); + return node; + } + + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); + } + + // Const node + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + if (shape.empty()) { + shape = {static_cast(data.size())}; + } else { + int size = 1; + for (auto i : shape) { + size *= i; + } + CHECK_EQ(data.size(), size); + } + Tensor tensor; + tensor.Resize(shape); + tensor.set_persistable(true); + std::memcpy(reinterpret_cast(tensor.mutable_data()), + reinterpret_cast(data.data()), + data.size() * sizeof(T)); + return Add(name, tensor, layout); + } + + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return 
Add(name, data, dims.Vectorize(), layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + int64_t size = 1; + for (auto i : shape) { + size *= i; + } + std::vector data(size, value); + return Add(name, data, shape, layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); + } + + // Data node + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); + } + + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[HUAWEI_ASCEND_NPU] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + private: + std::map>> nodes_; +}; + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..5d38a4b0e68df0ddd66e0642e34323c40a6f1056 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h @@ -0,0 +1,27 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// activation +USE_SUBGRAPH_BRIDGE(sigmoid, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(tanh, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu6, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softsign, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softplus, kHuaweiAscendNPU); +// conv +USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kHuaweiAscendNPU); diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.cc b/lite/kernels/huawei_ascend_npu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fdaa49b94f48ad12b58036cd89d2f545566cad6 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +ge::DataType CvtPrecisionType(PrecisionType itype) { + ge::DataType otype = ge::DT_FLOAT; + switch (itype) { + case PRECISION(kFloat): + otype = ge::DT_FLOAT; + break; + case PRECISION(kFP16): + otype = ge::DT_FLOAT16; + break; + case PRECISION(kInt8): + otype = ge::DT_INT8; + break; + case PRECISION(kInt16): + otype = ge::DT_INT16; + break; + case PRECISION(kInt32): + otype = ge::DT_INT32; + break; + case PRECISION(kInt64): + otype = ge::DT_INT64; + break; + // TODO(liq27) support more precision type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert precision type(" + << PrecisionToStr(itype) << ") from Lite to NPU"; + break; + } + return otype; +} + +ge::Format CvtDataLayoutType(DataLayoutType itype) { + ge::Format otype = ge::FORMAT_NCHW; + switch (itype) { + case DATALAYOUT(kNCHW): + otype = ge::FORMAT_NCHW; + break; + case DATALAYOUT(kNHWC): + otype = ge::FORMAT_NHWC; + break; + // TODO(liq27) support more data layout type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert data layout type(" + << DataLayoutToStr(itype) + << ") from Lite to HUAWEI_ASCEND_NPU"; + break; + } + return otype; +} + +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (size_t i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (size_t i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape, + DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); + auto in_size = in_tensor.dims().production(); + auto in_shape = in_tensor.dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + ge::TensorDesc out_desc(ge::Shape(out_shape), + CvtDataLayoutType(in_layout), + CvtPrecisionType(in_precision)); + auto out_size = out_desc.GetShape().GetShapeSize(); + CHECK_EQ(out_size, in_size); + ge::Tensor out_tensor; + out_tensor.SetTensorDesc(out_desc); + out_tensor.SetData(reinterpret_cast(in_tensor.raw_data()), + in_tensor.memory_size()); + return out_tensor; +} + +int CvtActMode(std::string act_type) { + int act_mode = 1; + if (act_type == "sigmoid") { + act_mode = 0; + } else if (act_type == "relu") { + act_mode = 1; + } else if (act_type == "tanh") { + act_mode = 2; + } else if (act_type == "relu_clipped" || act_type == "relu6") { 
+ act_mode = 3; + } else if (act_type == "elu") { + act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; + } else if (act_type == "abs") { + act_mode = 6; + } else if (act_type == "softsign") { + act_mode = 8; + } else if (act_type == "softplus") { + act_mode = 9; + } else if (act_type == "hard_sigmoid") { + act_mode = 10; + } else if (act_type == "thresholded_relu") { + act_mode = 11; + } else { + // TODO(liqi27) support more activation mode + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Unsupported activation type " + << act_type; + } + return act_mode; +} + +const std::string& CvtFormat(ge::Format format) { + static const int MAX_FORMAT_LENGTH = 25; + static const std::string format2string[] = { + "FORMAT_NCHW = 0", + "FORMAT_NHWC = 1", + "FORMAT_ND = 2", + "FORMAT_NC1HWC0 = 3", + "FORMAT_FRACTAL_Z = 4", + "FORMAT_NC1C0HWPAD = 5", + "FORMAT_NHWC1C0 = 6", + "FORMAT_FSR_NCHW = 7", + "FORMAT_FRACTAL_DECONV = 8", + "FORMAT_C1HWNC0 = 9", + "FORMAT_FRACTAL_DECONV_TRANSPOSE = 10", + "FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11", + "FORMAT_NC1HWC0_C04 = 12", + "FORMAT_FRACTAL_Z_C04 = 13", + "FORMAT_CHWN = 14", + "FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15", + "FORMAT_HWCN = 16", + "FORMAT_NC1KHKWHWC0 = 17", + "FORMAT_BN_WEIGHT = 18", + "FORMAT_FILTER_HWCK = 19", + "FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20", + "FORMAT_HASHTABLE_LOOKUP_KEYS = 21", + "FORMAT_HASHTABLE_LOOKUP_VALUE = 22", + "FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23", + "FORMAT_HASHTABLE_LOOKUP_HITS = 24"}; + auto x = static_cast(format); + CHECK_LT(x, MAX_FORMAT_LENGTH); + return format2string[x]; +} + +const std::string& CvtDataType(ge::DataType data_type) { + static const int MAX_DATATYPE_LENGTH = 14; + static const std::string datatype2string[] = {"DT_FLOAT=0", + "DT_FLOAT16=1", + "DT_INT8=2", + "DT_INT32=3", + "DT_UINT8=4", + "Unknown=5", + "DT_INT16=6", + "DT_UINT16=7", + "DT_UINT32=8", + "DT_INT64=9", + "DT_UINT64=10", + "DT_DOUBLE=11", + "DT_BOOL=12", + "DT_STRING=13"}; + + auto x = static_cast(data_type); + CHECK_LT(x, MAX_DATATYPE_LENGTH); + return datatype2string[x]; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.h b/lite/kernels/huawei_ascend_npu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..da9a8999ad09e545745f30e02ca62c60e6f9bf82 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.h @@ -0,0 +1,59 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
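// [Editor's note - illustration only, not part of the patch] CvtShape above pads a shape
// with leading 1s until it is 4-D, matching the NCHW layout expected by the Ascend IR.
// A standalone sketch of that behaviour (assuming int64_t dimensions):
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> CvtShapeSketch(const std::vector<int64_t>& in) {
  std::vector<int64_t> out(in.size() < 4 ? 4 - in.size() : 0, 1);  // leading 1s
  out.insert(out.end(), in.begin(), in.end());                     // then the original dims
  return out;
}

int main() {
  assert((CvtShapeSketch({32, 100}) == std::vector<int64_t>{1, 1, 32, 100}));
  assert((CvtShapeSketch({8, 3, 224, 224}) == std::vector<int64_t>{8, 3, 224, 224}));
  return 0;
}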
+ +#pragma once + +#include +#include +#include +#include +#include +// #include "graph/buffer.h" +#include "graph/tensor.h" +#include "graph/types.h" +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +ge::DataType CvtPrecisionType(PrecisionType itype); + +ge::Format CvtDataLayoutType(DataLayoutType itype); + +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape = {}, + DataLayoutType in_layout = DATALAYOUT(kNCHW)); + +int CvtActMode(std::string act_type); + +const std::string& CvtFormat(ge::Format format); +const std::string& CvtDataType(ge::DataType data_type); + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.cc b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e71c71ca28b163f27a9783572d585466335ef87 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc @@ -0,0 +1,483 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/subgraph_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/utils/io.h" +#include "lite/utils/md5.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. 
the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (size_t i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; + } + } + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} + +// Serialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op into files +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from cached file from:" + << model_path; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromFile( + model_path, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from cached file failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] read from " << config_path + << " failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading configuration success:" + << config_path; + std::string config_str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(config_str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; +} + +bool DeviceProgram::BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + // then added them into the IR graph + int status = 0; + subgraph::huawei_ascend_npu::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + CHECK(origin_program) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin 
program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kHuaweiAscendNPU))) { + return false; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kHuaweiAscendNPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return false; + } + } + // Collect the input and output nodes of the IR graph + std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } + std::vector device_onodes; + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); + } + // Build the IR graph to the om model + std::vector model_buffer; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Building model from model buffer..."; + if (!lite::huawei_ascend_npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Build model failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Build model success."; + // Load the om model and create a model manager client + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from memory ..."; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromMem( + model_buffer, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load model from memory success."; + // Update the precison and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); + } + if (!model_cache_dir.empty()) { + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << model_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved OM model success:"; + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (size_t i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << config_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved configuration file success."; + } + return true; +} + +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + 
std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + VLOG(3) << "[HUAWEI_ASCEND_NPU] Sharing buffer with origin tnsors..."; + if (device_idims_.empty() || device_odims_.empty()) { + if (!(model_client_->GetModelIOTensorDim(&device_idims_, &device_odims_))) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Get the dimensions of input and output " + "tensors failed!"; + return false; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim success."; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), output_names.size()); + for (size_t i = 0; i < input_names.size(); i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Inputs[" << i + << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + + // reset tensor desc + if ((*device_itensors)[i]->SetTensorDesc( + device_idims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetTensorDesc " + "success."; + } + // copy data from origin to device + if ((*device_itensors)[i]->SetData( + reinterpret_cast((*origin_itensors)[i]->raw_data()), + (*origin_itensors)[i]->memory_size()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the input tensors for the device program " + "and share their buffers with the origin input tensors"; + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_itensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); + } + for (size_t i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Outputs[" << i + << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // reset tensor desc + if ((*device_otensors)[i]->SetTensorDesc( + device_odims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor 
output tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor output tensor SetTensorDesc " + "success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the output tensors for the device program " + "and share their buffers with the origin output tensors"; + } + return true; +} + +bool DeviceProgram::SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + + for (size_t i = 0; i < output_names.size(); i++) { + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_otensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); + } + // unload model after model execution + CHECK_EQ(model_client_->UnloadModel(), true); + return true; +} + +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + // int istamp; + auto start_time = GetCurrentUS(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Starting ZeroCopyRun to ModelExecute ..."; + CHECK_EQ(model_client_->ModelExecute(device_itensors, device_otensors), true); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Process cost " << GetCurrentUS() - start_time + << " us"; + return true; +} + +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (size_t i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new ge::Tensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (size_t i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new ge::Tensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(); + auto device_id = ctx_->As().HuaweiAscendDeviceID(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get model cached dir: " << model_cache_dir; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get huawei ascend npu device id: " + << device_id; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile(input_names_, + output_names_, + origin_idims_, + model_cache_dir, + device_id)) { + // Build the 
model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. + if (!origin_program_) { + BuildOriginProgram(); + } + CHECK(origin_program_) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir, + device_id)) { + return false; + } + } + if (device_program->model_client_ == nullptr) { + return false; + } + device_programs_[origin_idims_] = device_program; + } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); +} + +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. + if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); + } + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); + } + if (!device_program->ZeroCopyRun(&device_itensors_, &device_otensors_)) { + return false; + } + if (!device_program->SharedBufferWithOutputTensors( + output_names_, &origin_otensors_, &device_otensors_)) { + return false; + } + return true; +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.block_idx, + param.program_desc, + param.exec_scope, + param.input_data_names, + param.output_data_names)); + CHECK(engine_); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Run(); +} + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kHuaweiAscendNPU, + kAny, + kNCHW, + paddle::lite::kernels::huawei_ascend_npu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.h b/lite/kernels/huawei_ascend_npu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..fb7d2efe0c29912a07f11a544c91432d69c51fa0 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.h @@ -0,0 +1,121 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
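// [Editor's note - illustration only, not part of the patch] BuildGraphAndCacheToFile
// above writes two sidecar files per subgraph into the model cache dir: "<md5>.om"
// (the built model) and "<md5>.cfg", where each output tensor is recorded as
// "<precision-as-int>:<dim>,<dim>,...;" and LoadFromCacheFile splits on ';', ':' and ','
// to recover it. A standalone sketch of the same encoding (the precision values below
// are hypothetical placeholders, not real PrecisionType enum values):
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

int main() {
  std::vector<int> otypes = {1, 1};  // hypothetical precision ids, one per output tensor
  std::vector<std::vector<int64_t>> odims = {{1, 1000}, {1, 5}};
  std::ostringstream os;
  for (size_t i = 0; i < otypes.size(); ++i) {
    os << otypes[i] << ":";
    for (auto d : odims[i]) os << d << ",";  // dims end with a trailing comma, as in the patch
    os << ";";                               // one record per output, ';'-terminated
  }
  std::cout << os.str() << std::endl;  // prints "1:1,1000,;1:1,5,;"
  return 0;
}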
+ +#pragma once + +#include +#include +#include +#include +#include "graph/tensor.h" +#include "lite/backends/huawei_ascend_npu/device.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +using TensorDesc = paddle::lite::huawei_ascend_npu::TensorDesc; +using AclModelClient = paddle::lite::huawei_ascend_npu::AclModelClient; + +class DeviceProgram { + public: + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id); + bool BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors); + bool ZeroCopyRun(std::vector>* device_itensors, + std::vector>* device_otensors); + + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + const std::shared_ptr& program_desc, + Scope* exec_scope, + const std::vector& input_names, + const std::vector& output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} + + protected: + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; + + private: + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + void PrepareForRun() override; + void Run() override; + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt index f9395d45ccecccaf3f873797d0c2d71eda266319..634a0afc551d83be58487d7393e092196e0f6cc5 100644 --- a/lite/kernels/mlu/CMakeLists.txt +++ b/lite/kernels/mlu/CMakeLists.txt @@ -4,6 +4,7 @@ endif() add_subdirectory(bridges) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) -add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) -add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) -add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) 
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu}) +add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps}) +# depend on transpose function in backend/x86/math/math_function +add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function}) diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt index 82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514..91323925e1ef49462c180fd96392d638e273fd69 100644 --- a/lite/kernels/mlu/bridges/CMakeLists.txt +++ b/lite/kernels/mlu/bridges/CMakeLists.txt @@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU) endif() lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor) -lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs}) +lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu) lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu) set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu) @@ -18,6 +18,16 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_transpose_op_mlu SRCS transpose_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_dropout_op_mlu SRCS dropout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_slice_op_mlu SRCS slice_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_split_op_mlu SRCS split_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -28,12 +38,35 @@ set(mlu_subgraph_bridges subgraph_bridge_pool_op_mlu subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu + subgraph_bridge_transpose_op_mlu subgraph_bridge_batch_norm_op_mlu subgraph_bridge_scale_op_mlu subgraph_bridge_interp_op_mlu subgraph_bridge_concat_op_mlu + subgraph_bridge_dropout_op_mlu + subgraph_bridge_slice_op_mlu + subgraph_bridge_split_op_mlu + subgraph_bridge_cast_op_mlu + subgraph_bridge_layout_op_mlu + subgraph_bridge_argmax_op_mlu + subgraph_bridge_squeeze_op_mlu + subgraph_bridge_reshape_op_mlu + subgraph_bridge_flatten_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") + +if (LITE_BUILD_EXTRA) + lite_cc_library(subgraph_bridge_lrn_op_mlu SRCS lrn_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu}) + 
lite_cc_library(subgraph_bridge_norm_op_mlu SRCS norm_op.cc DEPS ${subgraph_bridge_deps_mlu}) + set(mlu_subgraph_bridges + "${mlu_subgraph_bridges}" + subgraph_bridge_lrn_op_mlu + subgraph_bridge_gather_op_mlu + subgraph_bridge_norm_op_mlu + CACHE INTERNAL "mlu_subgraph_bridges") +endif() + lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) @@ -45,4 +78,21 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_transpose_converter_mlu SRCS transpose_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_dropout_converter_mlu SRCS dropout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_slice_converter_mlu SRCS slice_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_split_converter_mlu SRCS split_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + +if (LITE_BUILD_EXTRA) + lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host 
model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +endif() + message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 286195d9d5f961288dd0156db31ff8aacae58227..d24c7fac216ed0ba213a4fd95365132a693281c3 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); } graph->FuseOp(activation_op); + CNML_CALL(cnmlDestroyBaseOp(&activation_op)); return SUCCESS; } @@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu6, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(leaky_relu, kMLU, diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc index 2b7747f4d8b647b8cb621876907f6178ebf9fe88..11c0c3f732c4c29fff3aedc6cfdcf55760128b5d 100644 --- a/lite/kernels/mlu/bridges/act_op_test.cc +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -116,7 +118,7 @@ void test_act(std::vector x_shape, std::string op_type) { opdesc.SetAttr("offset", 0.5f); } - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor act_ref(op); @@ -134,7 +136,8 @@ void test_act(std::vector x_shape, std::string op_type) { TEST(MLUBridges, activation) { std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; - std::vector types{"sigmoid", "relu", "tanh", "leaky_relu"}; + std::vector types{ + "sigmoid", "relu", "relu6", "tanh", "leaky_relu"}; for (auto x_shape : shapes) { for (auto op_type : types) { test_act(x_shape, op_type); @@ -149,5 +152,6 @@ TEST(MLUBridges, activation) { USE_SUBGRAPH_BRIDGE(sigmoid, kMLU) USE_SUBGRAPH_BRIDGE(relu, kMLU) +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(tanh, kMLU) USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU) diff --git a/lite/kernels/mlu/bridges/argmax_op.cc b/lite/kernels/mlu/bridges/argmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b004639f07c79e5cc414e2d60bc1f32ec522f0f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + cnmlDimension_t argmax_mode = static_cast(axis); + auto mlu_output_dim = x->dims().Vectorize(); + // shape is NCHW, layout is NHWC + mlu_output_dim[axis] = 1; + auto input_tensor = graph->GetNode(x_var_name); + // if use_fp16 and axis is not c, cast input datatype from fp16 to fp32, so + // output datatype is int32 + bool cast_to_fp32 = + graph->FPType() == CNML_DATA_FLOAT16 && argmax_mode != CNML_DIM_C; + cnmlBaseOp_t cast_op{nullptr}; + std::shared_ptr fp32_input_tensor; + if (cast_to_fp32) { + fp32_input_tensor = graph->AddNode(x_var_name + ".fp32", + x_dims, + CNML_TENSOR, + CNML_NCHW, + CNML_DATA_FLOAT32); + cnmlCreateCastOp(&cast_op, + CNML_CAST_FLOAT16_TO_FLOAT32, + input_tensor->mlu_tensor(), + fp32_input_tensor->mlu_tensor()); + } + auto output_tensor = graph->AddNode( + out_var_name, mlu_output_dim, CNML_TENSOR, CNML_NCHW, CNML_DATA_INT32); + + CHECK(graph->HasNode(x_var_name)); + cnmlBaseOp_t argmax_op{nullptr}; + // ======================= DEBUG INFO ===================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "x dims: " << x->dims(); + VLOG(6) << "output dims: " << output->dims(); + VLOG(6) << "axis: " << axis; + VLOG(6) << "cast_to_fp32: " << cast_to_fp32; + cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======================= DEBUG END ===================== + + CNML_CALL(cnmlCreateArgmaxOp(&argmax_op, + argmax_mode, + cast_to_fp32 ? 
fp32_input_tensor->mlu_tensor() + : input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + if (cast_to_fp32) { + graph->FuseOp(cast_op); + } + graph->FuseOp(argmax_op); + CNML_CALL(cnmlDestroyBaseOp(&argmax_op)); + if (cast_op) { + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kMLU, + paddle::lite::subgraph::mlu::ArgmaxConverter); diff --git a/lite/kernels/mlu/bridges/argmax_op_test.cc b/lite/kernels/mlu/bridges/argmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9eeb172812b8deecd6a8f1f2eb321ade4289fa9b --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/argmax_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void argmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + auto y_shape = x_dims.Vectorize(); + y_shape.erase(y_shape.begin() + axis); + out->Resize(y_shape); + auto out_dims = out->dims(); + + auto* x_data = x->mutable_data(); + auto* out_data = out->mutable_data(); + + const int size = x_dims[axis]; + const int in_channel = x_dims.count(axis, x_dims.size()); + const int out_channel = out_dims.count(axis, out_dims.size()); + const int in_stride = x_dims.count(axis + 1, x_dims.size()); + const int out_stride = x_dims.count(0, axis); + // int index = 0; + for (int n = 0; n < out_stride; n++) { + for (int k = 0; k < in_stride; k++) { + const float* in_ptr = x_data + n * in_channel + k; + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(in_ptr[i * in_stride], i); + } + // sort + std::partial_sort(vec.begin(), + vec.begin() + 1, + vec.end(), + std::greater>()); + + out_dtype* out_ptr = out_data + n * out_channel + k; + *out_ptr = vec[0].second; + } + } +} + +void test_argmax(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + 
FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("arg_max"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + argmax_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + out_shape[axis] = 1; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, arg_max) { + test_argmax({1, 2, 3, 4}, 1); + test_argmax({1, 2, 3, 4}, 2); + test_argmax({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc index 7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55..ceac1ac696d788869e77a1b173cc0bb4d10a4e21 100644 --- a/lite/kernels/mlu/bridges/batch_norm_op.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mean = scope->FindVar(mean_var_name)->GetMutable(); auto mean_dims = mean->dims().Vectorize(); + if (mean_dims.size() < 4) { + mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1); + } auto mean_tensor = graph->AddNode( - mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto variance = scope->FindVar(variance_var_name)->GetMutable(); auto variance_dims = variance->dims().Vectorize(); + if (variance_dims.size() < 4) { + variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1); + } auto variance_tensor = graph->AddNode( - variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto scale = scope->FindVar(scale_var_name)->GetMutable(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - int co = static_cast(mean_dims[0]); + int co = static_cast(mean_dims[3]); + std::vector variance_trans(co); + std::vector mean_trans(co); for (int i = 0; i < co; ++i) { - variance->mutable_data()[i] = + variance_trans[i] = scale->data()[i] / sqrtf(variance->data()[i] + epsilon); - mean->mutable_data()[i] = - mean->data()[i] - - bias->data()[i] / variance->data()[i]; + mean_trans[i] = + mean->data()[i] - bias->data()[i] / variance_trans[i]; } auto input_tensor = graph->GetNode(x_var_name); @@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { mean_tensor->mlu_tensor(), variance_tensor->mlu_tensor())); - 
graph->BindConstData(variance_var_name, variance); - graph->BindConstData(mean_var_name, mean); + graph->BindConstRawData( + variance_var_name, variance_trans.data(), variance_trans.size(), true); + graph->BindConstRawData( + mean_var_name, mean_trans.data(), mean_trans.size(), true); graph->FuseOp(bn_op); + CNML_CALL(cnmlDestroyBaseOp(&bn_op)); + return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/cast_op.cc b/lite/kernels/mlu/bridges/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25d988ce5aee519dfb00574343956022b30a89e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto in_dtype = op_info->GetAttr("in_dtype"); + auto out_dtype = op_info->GetAttr("out_dtype"); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + + cnmlDataType_t out_type; + cnmlCastType_t cast_type; + if (in_dtype == 4 && out_dtype == 5) { + cast_type = CNML_CAST_FLOAT16_TO_FLOAT32; + out_type = CNML_DATA_FLOAT32; + } else if (in_dtype == 5 && out_dtype == 4) { + cast_type = CNML_CAST_FLOAT32_TO_FLOAT16; + out_type = CNML_DATA_FLOAT16; + } else { + CHECK(0) << "Unsupported cast type"; + } + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type); + + cnmlBaseOp_t cast_op; + CNML_CALL(cnmlCreateCastOp(&cast_op, + cast_type, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(cast_op); + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(cast, + kMLU, + paddle::lite::subgraph::mlu::CastConverter); diff --git a/lite/kernels/mlu/bridges/cast_op_test.cc b/lite/kernels/mlu/bridges/cast_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2389ad5560cd2ede710626cfd40f8db8bff56351 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/cast_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_cast_FP16_to_FP32(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 4, out_dtype = 5; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFP16); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], static_cast(copy_data[i]), 5e-4); + } +} + +void test_cast_FP32_to_FP16(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 5, out_dtype = 4; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFloat); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(static_cast(out_data[i]), copy_data[i], 5e-4); + } +} + +TEST(MLUBridges, cast) { + test_cast_FP16_to_FP32({2, 3, 4, 5}); + test_cast_FP16_to_FP32({6, 3, 2, 5}); + test_cast_FP32_to_FP16({2, 3, 4, 5}); + test_cast_FP32_to_FP16({6, 3, 2, 5}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(cast, kMLU); diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc 
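The batch_norm bridge above no longer overwrites the variance/mean tensors in place; it folds scale, bias and epsilon into two per-channel buffers and binds those as const raw data. A standalone sketch of that folding, assuming the usual inference-time formula y = scale * (x - mean) / sqrt(var + eps) + bias; the struct and function names are made up for illustration.

// Editor's sketch (not part of the patch): per-channel batch-norm folding,
// mirroring the variance_trans / mean_trans computation in batch_norm_op.cc.
#include <cmath>
#include <vector>

struct FoldedBN {
  std::vector<float> alpha;  // folded "variance" handed to the BN op
  std::vector<float> mean;   // folded "mean"
};

// y = scale * (x - mean) / sqrt(var + eps) + bias
//   = alpha * (x - (mean - bias / alpha)), with alpha = scale / sqrt(var + eps)
FoldedBN FoldBatchNorm(const std::vector<float>& scale,
                       const std::vector<float>& bias,
                       const std::vector<float>& mean,
                       const std::vector<float>& var,
                       float epsilon) {
  FoldedBN out;
  out.alpha.resize(scale.size());
  out.mean.resize(scale.size());
  for (size_t i = 0; i < scale.size(); ++i) {
    out.alpha[i] = scale[i] / std::sqrt(var[i] + epsilon);
    out.mean[i] = mean[i] - bias[i] / out.alpha[i];
  }
  return out;
}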
index 14f0da746a00c1ea10ffae824217dbb2df84df55..1d566639937d79cf1c98c70bfc1294d874fb89c4 100644 --- a/lite/kernels/mlu/bridges/concat_op.cc +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto dims = output_dims.size(); int axis = (param_axis < 0) ? (param_axis + dims) : param_axis; - CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; - int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; - int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + CHECK_LT(axis, dims) << "Unsupport dims in mlu concat"; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { &outputs, 1)); graph->FuseOp(concat_op); + CNML_CALL(cnmlDestroyBaseOp(&concat_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e7e21f7ad2f64275746e015289c9372368e46f5c..6d10605e2c4060cbd8b30d358ac15f2e78f13ca5 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include "lite/kernels/mlu/bridges/graph.h" #include "lite/kernels/mlu/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* op_info = op->op_info(); const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; + CHECK(!op_info->HasAttr("act_type")); // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); @@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; + const auto groups = op_info->GetAttr("groups"); + CHECK_EQ(input_dims.size(), 4u); CHECK_EQ(filter_dims.size(), 4u); + CHECK(!(op_info->HasAttr("fuse_relu") && + (op_info->GetAttr("fuse_relu") == true))) + << "UnSupported param fuse_relu is true!"; const auto strides = op_info->GetAttr>("strides"); auto dilations = op_info->GetAttr>("dilations"); auto paddings = op_info->GetAttr>("paddings"); @@ -70,18 +78,36 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { padding_algorithm, input_dims, filter_dims); + bool is_group_mode = groups > 1; + bool is_depthwise_mode = false; + if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 && + dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw} + is_depthwise_mode = true; + is_group_mode = false; + } + + auto input_tensor = graph->GetNode(input_var_name); const auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); + std::vector cnml_filter_shape = { + filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]}; + if (is_depthwise_mode) { + /*paddle filter shape is {oc , ic / groups == 1, kh, kw} while + cnml depthwise conv filter expect shape {oc / groups == 1 , ic , kh, kw} + so we should shape filter shape + */ + cnml_filter_shape = { + filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]}; + } // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, - filter_dims.Vectorize(), + cnml_filter_shape, 
CNML_FILTER, CNML_NCHW, graph->FPType()); - const auto weight_scale = - op_info->GetAttr>("weight_scale"); + const auto weight_scale = op_info->GetInputScale(filter_var_name); if (filter->precision() == PrecisionType::kUnk || filter->precision() == PrecisionType::kInt8) { @@ -89,15 +115,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { dequant(filter_dequant.data(), filter->mutable_data(), 1, - filter_dims[0], - filter_dims[1] * filter_dims[2] * filter_dims[3], + cnml_filter_shape[0], + cnml_filter_shape[1] * cnml_filter_shape[2] * cnml_filter_shape[3], weight_scale); transpose(filter_dequant.data(), filter->mutable_data(), - {static_cast(filter_dims[0]), - static_cast(filter_dims[1]), - static_cast(filter_dims[2]), - static_cast(filter_dims[3])}, + {static_cast(cnml_filter_shape[0]), + static_cast(cnml_filter_shape[1]), + static_cast(cnml_filter_shape[2]), + static_cast(cnml_filter_shape[3])}, {0, 2, 3, 1}); filter->set_precision(PrecisionType::kFloat); } else if (filter->precision() != PrecisionType::kFloat) { @@ -116,7 +142,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector bias_shape; if (bias_data_size == oc) { // 0: {oc} - bias_shape = {oc}; + bias_shape = {1, 1, 1, oc}; } else if (bias_data_size == output_data_size / bs) { LOG(FATAL) << "Unsupported ... ..."; // 1: {1, oc, oh, ow} @@ -130,18 +156,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } - const auto input_scale = op_info->GetAttr("input_scale"); + const auto input_scale = op_info->GetInputScale(input_var_name)[0]; bool use_first_conv = false; - if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) { use_first_conv = true; } @@ -158,38 +181,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings[0], paddings[0])); const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); const auto std_tensor = graph->AddNode("first_conv_std_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); graph->BindConstRawData("first_conv_mean_tensor", - lite::DeviceInfo::Global().MeanVec().data(), + lite::TargetWrapperMlu::MeanVec().data(), 3, false); graph->BindConstRawData("first_conv_std_tensor", - lite::DeviceInfo::Global().StdVec().data(), + lite::TargetWrapperMlu::StdVec().data(), 3, false); - graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + input_tensor->set_mlu_dtype(CNML_DATA_UINT8); CNML_CALL(cnmlCreateConvFirstOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), mean_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr, std_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else if (is_depthwise_mode) { + cnmlConvDepthwiseOpParam_t conv_depthwise_param; + cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param, + strides[0], + strides[1], + paddings[0] * 2, + paddings[2] * 2); + CNML_CALL(cnmlCreateConvDepthwiseOpForward( + &conv_op, + conv_depthwise_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param)); + } else if (is_group_mode) { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvGroupOpForward( + &conv_op, + conv_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + groups)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } else { cnmlConvOpParam_t conv_param; + VLOG(5) << "conv param (" << input_var_name << ")" + << "stride: " << strides[0] << ',' << strides[1] << '\t' + << "dilations: " << dilations[0] << ',' << dilations[1] << '\t' + << "paddings: " << paddings[0] << ',' << paddings[2] << std::endl; CNML_CALL(cnmlCreateConvOpParam(&conv_param, strides[0], strides[1], @@ -200,19 +260,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CNML_CALL(cnmlCreateConvOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } - graph->SetComputingDataType( - conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); - graph->SetComputingDataType( - conv_op, - filter_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + if (!is_depthwise_mode) { + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *max_element(weight_scale.begin(), weight_scale.end())); + } CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); if (HasInputArg(op_info, scope, "Bias")) { auto* bias = scope->FindVar(bias_var_name)->GetMutable(); @@ -220,6 +282,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyBaseOp(&conv_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index 1b04814d7d88d227d0bb3e0b58aef26d62f06966..e23f7c68ab0048b8cc04ffdae33ea94fcabbcf65 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -13,8 +13,11 @@ // limitations under the License. 
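The concat bridge above replaces the hard-coded 4-D map {0, 3, 1, 2} with a GetAxisNHWC2NCHW helper. A minimal sketch of the remapping it performs for 4-D tensors; only the 4-D table is confirmed by the removed code, and the function names below are illustrative, not the patch's helpers.

// Editor's sketch (not part of the patch): remapping a concat axis given in
// NCHW order to the corresponding NHWC position. For 4-D tensors the table is
// {0, 3, 1, 2} (N->0, C->3, H->1, W->2), the same map the patch removes.
#include <cassert>
#include <vector>

std::vector<int> AxisNHWC2NCHW4D() { return {0, 3, 1, 2}; }

int RemapConcatAxis(int param_axis, int rank) {
  assert(rank == 4);  // only the 4-D mapping is shown here
  int axis = param_axis < 0 ? param_axis + rank : param_axis;
  return AxisNHWC2NCHW4D()[axis];
}
// Example: concatenating along channels (axis 1 in NCHW) becomes axis 3 in NHWC.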
#include "lite/operators/conv_op.h" + #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -221,8 +224,10 @@ void test_conv(int bs, opdesc_mlu.SetAttr("groups", groups); opdesc_mlu.SetAttr("fuse_relu", static_cast(fuse_relu)); - opdesc_mlu.SetAttr("weight_scale", std::vector(oc, filter_scale)); - opdesc_mlu.SetAttr("input_scale", input_scale); + OpInfo op_info(opdesc_mlu); + op_info.SetInputScale(filter_int_var_name, + std::vector(oc, filter_scale)); + op_info.SetInputScale(input_var_name, {input_scale}); if (has_bias) { if (is_channel_bias) { @@ -231,7 +236,7 @@ void test_conv(int bs, bias->Resize({output_shape}); } FillTensor(bias); - opdesc_mlu.SetInput("Bias", {bias_var_name}); + op_info.SetInput("Bias", {bias_var_name}); } for (int i = 0; i < bs; i++) { @@ -245,7 +250,7 @@ void test_conv(int bs, } // create and convert op to MLU model, then run it on MLU - auto op = CreateOp(opdesc_mlu, &scope); + auto op = CreateOp(op_info, &scope); LaunchOp(op, {input_var_name}, {output_var_name}); // compare results auto* output_data = output->mutable_data(); @@ -331,6 +336,10 @@ TEST(MLUBridges, conv) { #endif } +TEST(MLUBridges, depthwise_conv2d) { + test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3); +} + } // namespace mlu } // namespace subgraph } // namespace lite diff --git a/lite/kernels/mlu/bridges/dropout_op.cc b/lite/kernels/mlu/bridges/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9aa296236e05a0c80ed9b7001f940cce99b019f7 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
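The conv bridge above tells depthwise convolution apart from grouped convolution and swaps the first two filter dimensions in the depthwise case, as its inline comment explains. A minimal sketch of just that shape decision, with made-up names and the CNML calls left out.

// Editor's sketch (not part of the patch): choosing the CNML filter shape.
// Paddle stores a depthwise filter as {oc, ic/groups == 1, kh, kw}; the CNML
// depthwise op expects {oc/groups == 1, ic, kh, kw}, so dims 0 and 1 swap.
#include <cstdint>
#include <vector>

std::vector<int64_t> CnmlFilterShape(const std::vector<int64_t>& filter_dims,
                                     int groups,
                                     const std::vector<int>& dilations) {
  bool is_depthwise = filter_dims[0] == groups && filter_dims[1] == 1 &&
                      dilations[0] == 1 && dilations[1] == 1;
  if (is_depthwise) {
    return {filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]};
  }
  return filter_dims;  // group (>1) and ordinary conv keep the Paddle layout
}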
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + /* auto mask_var_name = op_info->Output("Mask").front(); */ + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + /* auto mask = scope->FindVar(mask_var_name)->GetMutable(); */ + /* auto mask_dims = mask->dims().Vectorize(); */ + /* auto mask_tensor = graph->AddNode( */ + /* mask_var_name, mask_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); */ + + // is_test is true by default + // if(op_info->HasAttr("is_test")){ + // auto is_test = op_info->GetAttr("is_test"); + // CHECK(is_test != true); + // } + + // Param fix_seed and seed is useless in MLU + + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + std::vector shape = {1, 1, 1, 1}; + std::string alpha_var_name = string_format("dropout_alpha_%p", op); + std::string beta_var_name = string_format("dropout_beta_%p", op); + auto alpha_tensor = graph->AddNode( + alpha_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + beta_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData(alpha_var_name, &alpha, 1); + graph->BindConstRawData(beta_var_name, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(dropout, + kMLU, + paddle::lite::subgraph::mlu::DropoutConverter); diff --git a/lite/kernels/mlu/bridges/dropout_op_test.cc b/lite/kernels/mlu/bridges/dropout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f03e3051a6c568d541b98b64808e27470d8916 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op_test.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
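The dropout bridge above lowers inference-time dropout to a single scale op: alpha is 1 - dropout_prob for downgrade_in_infer, 1 for upscale_in_train, and beta is 0. The same arithmetic as a standalone reference; it matches dropout_ref in the test that follows.

// Editor's sketch (not part of the patch): inference-time dropout reduces to
// out[i] = alpha * x[i] + beta, exactly what the bridge feeds the scale op.
#include <string>
#include <vector>

std::vector<float> DropoutInfer(const std::vector<float>& x,
                                float dropout_prob,
                                const std::string& dropout_implementation) {
  float alpha = 1.0f - dropout_prob;
  if (dropout_implementation == "upscale_in_train") alpha = 1.0f;
  const float beta = 0.0f;
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = alpha * x[i] + beta;
  return out;
}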
+ +#include "lite/operators/dropout_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void dropout_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * alpha + beta; + } +} + +void test_dropout(int bs, + int ic, + int ih, + int iw, + std::string dropout_implementation, + float dropout_prob, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string mask_var_name("mask"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* mask = scope.Var(mask_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + bool is_test = true; + bool fix_seed = false; + int seed = 0; + cpp::OpDesc opdesc; + opdesc.SetType("dropout"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("Mask", {mask_var_name}); + opdesc.SetAttr("is_test", is_test); + opdesc.SetAttr("fix_seed", fix_seed); + opdesc.SetAttr("seed", seed); + opdesc.SetAttr("dropout_implementation", dropout_implementation); + opdesc.SetAttr("dropout_prob", dropout_prob); + VLOG(6) << "mask: " << mask->dims()[0] << std::endl; + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + dropout_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, dropout) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto 
dropout_implementation : + {"downgrade_in_infer", "upscale_in_train"}) { + for (auto dropout_prob : {0.f, 1.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + << " dropout_implementation: " << dropout_implementation + << " dropout_prob: " << dropout_prob; + test_dropout( + bs, ic, ih, iw, dropout_implementation, dropout_prob, 0.); + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(dropout, kMLU); diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 41526a0100ba71be9eda25983cb96aa888d6cf4d..5f7192a0628a7887dbca15d63f1ba22799d7ee4b 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -23,7 +23,7 @@ namespace mlu { std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { auto x_dims = x.dims(); - CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; + // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; auto y_dims = y->dims(); CHECK_GE(x_dims.size(), y_dims.size()); @@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->FuseOp(elementwise_op); + CNML_CALL(cnmlDestroyBaseOp(&elementwise_op)); cnmlBaseOp_t act_op; if (op_type == "fusion_elementwise_add_activation") { auto mid_tensor = graph->GetNode(out_var_name + "_mid"); @@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { mid_tensor->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(act_op); + CNML_CALL(cnmlDestroyBaseOp(&act_op)); } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index e5087dd708eee3ba255fbfa0383d31b12a6b6870..7844e5b1b57567f72750b21ba288547cb165eb54 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector& x_shape, opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 286feec8d4d44eaa025f333d559c32ca72f042ff..e820fc7abca89a573cfbd7efd7ecca1640905e6a 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto w_var_name = op_info->Input("W").front(); auto output_var_name = op_info->Output("Out").front(); - // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + CHECK(!op_info->HasAttr("activation_type")); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); auto output = scope->FindVar(output_var_name)->GetMutable(); @@ -45,11 +45,30 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(w_dims.size(), 2UL); // Create w node - std::vector w_shape{w_dims[1], w_dims[0]}; + std::vector cnml_w_shape; + if (x_dims.size() == 4) { + if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) { + cnml_w_shape = { + static_cast(w_dims[1]), + static_cast(x_dims[1]), // input_c + static_cast(x_dims[2]), // input_h + static_cast(x_dims[3]), // input_w + }; + } 
else { + LOG(FATAL) + << "in fc op, we expect input_h * input_w * input_c == filter_c" + << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2] + << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0] + << std::endl; + } + } else { + cnml_w_shape = {w_dims[1], w_dims[0]}; + } + auto w_tensor = graph->AddNode( - w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); + w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); - auto input_scale = op_info->GetAttr("input_scale"); + auto input_scale = op_info->GetInputScale(x_var_name)[0]; auto output_tensor = graph->AddNode(output_var_name, output->dims().Vectorize(), @@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (HasInputArg(op_info, scope, "Bias")) { bias_var_name = op_info->Input("Bias").front(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); + auto bias_dims = bias->dims().Vectorize(); CHECK(!graph->HasNode(bias_var_name)); + if (bias_dims.size() < 4u) { + bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1); + } // CHECK_EQ(bias_dims.production(), n); - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } cnmlBaseOp_t fc_op; @@ -82,24 +101,52 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); graph->SetComputingDataType( fc_op, graph->GetNode(x_var_name)->mlu_tensor(), 1 / input_scale); - auto weight_scale = op_info->GetAttr>("weight_scale"); + auto weight_scale = op_info->GetInputScale(w_var_name); // LOG(INFO) << "W precision " << int(w->precision()); if (w->precision() == PrecisionType::kUnk || w->precision() == PrecisionType::kInt8) { std::vector w_dequant(w->data_size()); - dequant(w_dequant.data(), - w->mutable_data(), - 1, - w_dims[1], - w_dims[0], - weight_scale); - for (int i = 0; i < w_dims[1]; i++) { - for (int j = 0; j < w_dims[0]; j++) { - w->mutable_data()[i * w_dims[0] + j] = - w_dequant[i + j * w_dims[1]]; - } + if (cnml_w_shape.size() == 2) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1], + weight_scale); + transpose2d(w_dequant.data(), + w->mutable_data(), + {static_cast(cnml_w_shape[0]), + static_cast(cnml_w_shape[1])}); + } else if (cnml_w_shape.size() == 4) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3], + weight_scale); + + int c_o_num = cnml_w_shape[0]; + int c_i_num = cnml_w_shape[1]; + int h_i_num = cnml_w_shape[2]; + int w_i_num = cnml_w_shape[3]; + + // chw == ci * hi * wi == w_dim[0] + // first trans [chw, co] -> [co,chw] + std::vector first_trans_output(w_dequant.size()); + int chw = c_i_num * h_i_num * w_i_num; + transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num}); + + // second trans [co,ci,hi,wi] -> [co,hi,wi,ci] + transpose(first_trans_output.data(), + w->mutable_data(), + {c_o_num, c_i_num, h_i_num, w_i_num}, + {0, 2, 3, 1}); + } else { + LOG(FATAL) << "expect w_shape.size == 2 or 4, but got " + << cnml_w_shape.size() << std::endl; } + w->set_precision(PrecisionType::kFloat); } else if (w->precision() != PrecisionType::kFloat) { LOG(FATAL) << "UnSupported weight precision!"; @@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { 
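For a 4-D input the fc bridge above reshapes the weight to {co, ci, hi, wi} and then rearranges the dequantized data in two passes: [chw, co] -> [co, chw], followed by [co, ci, hi, wi] -> [co, hi, wi, ci]. A standalone index-level sketch of the second permutation (the first is an ordinary 2-D transpose); the helper name is made up and does not reuse the patch's transpose utilities.

// Editor's sketch (not part of the patch): permute a {co, ci, hi, wi} buffer
// into {co, hi, wi, ci}, the layout the MLU fc kernel expects for its weight.
#include <vector>

std::vector<float> ToCoHiWiCi(const std::vector<float>& src,
                              int co, int ci, int hi, int wi) {
  std::vector<float> dst(src.size());
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          dst[((o * hi + h) * wi + w) * ci + c] =
              src[((o * ci + c) * hi + h) * wi + w];
  return dst;
}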
graph->SetComputingDataType( fc_op, w_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + 1 / *max_element(weight_scale.begin(), weight_scale.end())); graph->FuseOp(fc_op); + CNML_CALL(cnmlDestroyBaseOp(&fc_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index fe1c889f431350b4175ac400aefe77e6392405c5..b7c576581b7bab4b5dd3f2538350a65f94d62c62 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -131,14 +131,15 @@ void test_fc(const std::vector& input_shape, fc_op_desc_mlu.SetOutput("Out", {out_var_name}); fc_op_desc_mlu.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); - fc_op_desc_mlu.SetAttr("weight_scale", - std::vector(w_shape[1], w_scale)); - fc_op_desc_mlu.SetAttr("input_scale", input_scale); + OpInfo op_info(fc_op_desc_mlu); + op_info.SetInputScale(w_int_var_name, + std::vector(w_shape[1], w_scale)); + op_info.SetInputScale(input_var_name, {input_scale}); if (has_bias) { - fc_op_desc_mlu.SetInput("Bias", {bias_var_name}); + op_info.SetInput("Bias", {bias_var_name}); } - auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); + auto fc_op_mlu = CreateOp(op_info, &scope); Tensor input_tmp, out_tmp; input_tmp.Resize(input_shape); @@ -175,9 +176,9 @@ void test_fc(const std::vector& input_shape, TEST(MLUBridges, fc) { for (bool use_bias : {true, false}) { - // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); - // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); - // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); } } diff --git a/lite/kernels/mlu/bridges/flatten_op.cc b/lite/kernels/mlu/bridges/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..faf7e6fd2801cdcaad4bce0a20921843f1d1b516 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
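The conv and fc test updates above stop writing weight_scale/input_scale attributes and attach per-tensor scales to the OpInfo instead, which the converters read back via GetInputScale. A hedged distillation of that pattern; it only builds inside the Paddle-Lite tree, and the function and variable names are made up for illustration.

// Editor's sketch (not part of the patch): per-tensor quantization scales now
// travel on the OpInfo, keyed by input variable name.
#include <string>
#include <vector>
#include "lite/core/op_lite.h"

void AttachQuantScales(paddle::lite::OpInfo* op_info,
                       const std::string& w_var_name, float w_scale, int oc,
                       const std::string& x_var_name, float x_scale) {
  // Per-channel scale for the quantized weight, per-tensor scale for the input;
  // converters read these back with GetInputScale(var_name).
  op_info->SetInputScale(w_var_name, std::vector<float>(oc, w_scale));
  op_info->SetInputScale(x_var_name, {x_scale});
}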
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FlattenConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Flatten op =================================== + cnmlBaseOp_t flatten_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + cnmlReshapeOpParam_t reshape_param{nullptr}; + CNML_CALL(cnmlCreateNdReshapeOpParam( + &reshape_param, cnml_trans2_input_shape, output->dims().size())); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&flatten_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Flatten End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // ============== DEBUG LOG =============== + + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ============== DEBUG END =============== + graph->FuseOp(trans1_op); + graph->FuseOp(flatten_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&flatten_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(flatten, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); +REGISTER_SUBGRAPH_BRIDGE(flatten2, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); diff --git a/lite/kernels/mlu/bridges/flatten_op_test.cc b/lite/kernels/mlu/bridges/flatten_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..190b837ffeecfd494ffbd748220207cd63da5c06 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
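The flatten bridge above transposes the NHWC buffer to NCHW, applies the reshape, then transposes back to NHWC. For reference, the 2-D shape flatten/flatten2 produce from an input shape and axis, which is what that reshape targets; the function name is illustrative only.

// Editor's sketch (not part of the patch): the 2-D output shape of
// flatten/flatten2 for a given split axis.
#include <cstdint>
#include <vector>

std::vector<int64_t> FlattenShape(const std::vector<int64_t>& dims, int axis) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) inner *= dims[i];
  return {outer, inner};  // e.g. {1, 2, 4, 4} with axis = 2 -> {2, 16}
}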
+ +#include "lite/operators/flatten_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_flatten(std::vector input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("flatten2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + auto op = CreateOp(opdesc, &scope); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, flatten) { test_flatten({1, 2, 4, 4}, 2); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); diff --git a/lite/kernels/mlu/bridges/gather_op.cc b/lite/kernels/mlu/bridges/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b68f1af76456eede14ec550c623d6a8355f5d5e8 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto index_var_name = op_info->Input("Index").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + auto index_tensor = graph->GetNode(index_var_name); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t gather_op; + CNML_CALL(cnmlCreateGatherV2Op(&gather_op, + x_tensor->mlu_tensor(), + index_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + CNML_DIM_N)); + graph->FuseOp(gather_op); + CNML_CALL(cnmlDestroyBaseOp(&gather_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(gather, + kMLU, + paddle::lite::subgraph::mlu::GatherConverter); diff --git a/lite/kernels/mlu/bridges/gather_op_test.cc b/lite/kernels/mlu/bridges/gather_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..413de7c9d7fda750b387c2daa21ef1e40e7982c7 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/gather_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void gather_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto index = + scope->FindVar(op_info->Input("Index").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_dims = x->dims(); + auto index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + + int batch_size = index_dims[0]; + DDim out_dims = x_dims; + out_dims[0] = batch_size; + out->Resize(out_dims); + + auto x_data = x->data(); + auto index_data = index->data(); + auto out_data = out->mutable_data(); + + auto slice_num = x_dims[0]; + auto slice_size = x_dims.Slice(1, x_dims.size()).production(); + for (int i = 0; i < batch_size; i++) { + auto index = index_data[i]; + CHECK_LT(index, slice_num) << "index <= slice_num"; + CHECK_GE(index, 0) << "index > 0"; + memcpy(out_data + i * slice_size, + x_data + index * slice_size, + slice_size * sizeof(float)); + } +} + +void test_gather() { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + std::string index_var_name = "index"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + auto* index = scope.Var(index_var_name)->GetMutable(); + + x->Resize({5, 4, 3, 2}); + index->Resize({2}); + // initialize input&output data + FillTensor(x); + FillTensor(index, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("gather"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Index", {index_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + gather_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input; + input.Resize({5, 4, 3, 2}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(5), + static_cast(4), + static_cast(3), + static_cast(2)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name, index_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output; + output.Resize(out->dims()); + transpose(out_data, + output.mutable_data(), + {static_cast(out->dims()[0]), + static_cast(out->dims()[2]), + static_cast(out->dims()[3]), + static_cast(out->dims()[1])}, + {0, 3, 1, 2}); + out_data = output.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, gather) { test_gather(); } + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(gather, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 65c2f8214c13ee8d004dbe4b2e706523d007469c..bbe88547c8d60e1468653a28dad97af09b24f952 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -27,10 +27,14 @@ 
std::shared_ptr Graph::AddNode(const std::string& name, cnmlTensorType_t tensor_type, cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order, void* raw_ptr) { CHECK(!HasNode(name)); + VLOG(5) << "add mlu node: " << name << "\t data type " + << static_cast(mlu_dtype) << "\t data order " + << static_cast(data_order); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 2c6bd63a87e53332a329d0c5c66fcf372a2584ca..07c6b20efb9a72106cf6ae288c411e490345b089 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -15,13 +15,15 @@ #pragma once #include -#include #include #include +#include #include + #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#include "lite/utils/env.h" #define PRINT_HW_TIME false @@ -45,32 +47,30 @@ class Graph { CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); #endif } - ~Graph() { FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); - for (auto op : ops_) { - CNML_CALL(cnmlDestroyBaseOp(&op)); - } #if PRINT_HW_TIME CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); double total_time = 0; - for (auto& f : time_log_) { - total_time += f; + if (!time_log_.empty()) { + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; } - std::cout << "cnml hardware time for " << time_log_.size() - << " process:" << total_time / time_log_.size() << std::endl; #endif } - // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataOrder_t shape_order = CNML_NCHW, cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC, void* raw_ptr = nullptr); std::shared_ptr GetNode(const std::string& name) { @@ -82,9 +82,16 @@ class Graph { return nodes_.find(name) != nodes_.end(); } - void AddInput(std::shared_ptr tensor) { + void AddInput(std::shared_ptr tensor, + bool disable_batch_size_changeable = true) { inputs_.push_back(tensor->mlu_tensor()); input_tensors_.push_back(tensor); + if (!disable_batch_size_changeable) { + constexpr int input_dimNb = 4; + bool input_dim_mutable[4] = {true, false, false, false}; + CNML_CALL(cnmlSetTensorDimMutable( + tensor->mlu_tensor(), input_dim_mutable, input_dimNb)); + } } void AddOutput(std::shared_ptr tensor) { @@ -92,6 +99,22 @@ class Graph { output_tensors_.push_back(tensor); } + std::vector>* MutableInputs() { + return &input_tensors_; + } + + std::vector>* MutableOutputs() { + return &output_tensors_; + } + void GenOfflineModel(const std::string& name) { + cnmlModel_t model; + const std::string& symbol = "subnet0"; + const auto& filename = name + ".offline.cambricon"; + CNML_CALL(cnmlCreateModel(&model, filename.c_str())); + CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str())); + CNML_CALL(cnmlSaveModel(model, filename.c_str())); + CNML_CALL(cnmlDestroyModel(model)); + } void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); } void Compile(cnmlCoreVersion_t core_version, int core_number) { @@ -103,18 +126,37 @@ class Graph { 
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number)); CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version)); CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_)); - for (auto in : input_tensors_) { - input_addrs_.push_back(in->mlu_data()); - } - for (auto out : output_tensors_) { - output_addrs_.push_back(out->mlu_data()); - } } +#define MEASURE_HWTIME_START(que) \ + do { \ + CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \ + } while (0) + +#define MEASURE_HWTIME_END(que) \ + do { \ + thread_local float hw_time; \ + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \ + CNRT_CALL(cnrtSyncQueue(que)); \ + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \ + hw_time /= 1000.0f; \ + DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \ + std::lock_guard lk(time_mut_); \ + time_log_.push_back(hw_time); \ + } while (0) + void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + input_addrs_.resize(input_tensors_.size()); + output_addrs_.resize(output_tensors_.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = input_tensors_[i]->mlu_data(); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = output_tensors_[i]->mlu_data(); + } + #if PRINT_HW_TIME - thread_local float hw_time; - CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); + MEASURE_HWTIME_START(que); #endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), @@ -124,18 +166,46 @@ class Graph { &forward_param, que)); #if PRINT_HW_TIME - CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); + MEASURE_HWTIME_END(que); #endif + } - CNRT_CALL(cnrtSyncQueue(que)); + void Compute(cnrtQueue_t que, + const std::vector>& in, + const std::vector>& out) { + std::vector in_tensor; + std::vector out_tensor; + input_addrs_.resize(in.size()); + output_addrs_.resize(out.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = in[i]->mlu_data(); + in_tensor.push_back(in[i]->mlu_tensor()); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = out[i]->mlu_data(); + out_tensor.push_back(out[i]->mlu_tensor()); + } + +#if PRINT_HW_TIME + MEASURE_HWTIME_START(que); +#endif + /* Because of using cnmlSetTensorDimMutable, cnmlComputeFusionOpForward_V3 + * -> cnmlComputeFusionOpForward_V4 */ + CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_, + &in_tensor[0], + input_addrs_.data(), + input_addrs_.size(), + &out_tensor[0], + output_addrs_.data(), + output_addrs_.size(), + que, + NULL)); #if PRINT_HW_TIME - CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); - hw_time /= 1000.0f; - DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; - std::lock_guard lk(time_mut_); - time_log_.push_back(hw_time); + MEASURE_HWTIME_END(que); #endif } +#undef MEASURE_HWTIME_START +#undef MEASURE_HWTIME_END template void* RegisterConstData(size_t len) { @@ -165,7 +235,7 @@ class Graph { CNML_CALL(cnmlBindConstData_V2( nodes_[tensor_name]->mlu_tensor(), alloc_data, false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len); + void* data_fp16 = RegisterConstData(len); CNRT_CALL( cnrtCastDataType(const_cast(static_cast(data)), CNRT_FLOAT32, @@ -180,7 +250,7 @@ class Graph { } } - void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { + void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) { const float* data = tensor->data(); size_t len = 
tensor->data_size(); if (fp_type_ == CNML_DATA_FLOAT32) { @@ -189,10 +259,14 @@ class Graph { const_cast(static_cast(data)), false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>(); - for (size_t i = 0; i < len; ++i) { - data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]); - } + void* data_fp16 = RegisterConstData(len); + CNRT_CALL( + cnrtCastDataType(const_cast(static_cast(data)), + CNRT_FLOAT32, + data_fp16, + CNRT_FLOAT16, + len, + nullptr)); CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(), static_cast(data_fp16), false)); @@ -206,19 +280,23 @@ class Graph { float scale, cnmlDataType_t data_type = CNML_DATA_INT8) { cnmlQuantizedParam_t quant_param; - CNML_CALL( - cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0)); + int pos = scale2position(scale); + auto cnml_scale = pow(2, pos) * scale; + VLOG(5) << "[cnml quantized param] pos: " << pos + << "\tscale: " << cnml_scale << std::endl; + CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0)); CNML_CALL( cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param)); CNML_CALL(cnmlDestroyQuantizedParam(&quant_param)); } - void SetFPType(::paddle::lite_api::PrecisionType type) { + void SetFPType(paddle::lite_api::PrecisionType type) { + origin_fp_type_ = type; switch (type) { - case ::paddle::lite_api::PrecisionType::kFP16: + case paddle::lite_api::PrecisionType::kFP16: fp_type_ = CNML_DATA_FLOAT16; break; - case ::paddle::lite_api::PrecisionType::kFloat: + case paddle::lite_api::PrecisionType::kFloat: fp_type_ = CNML_DATA_FLOAT32; break; default: @@ -230,14 +308,14 @@ class Graph { private: cnmlDataType_t fp_type_{CNML_DATA_FLOAT32}; - std::map> nodes_; + paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)}; + std::unordered_map> nodes_; std::vector inputs_; std::vector outputs_; std::vector input_addrs_; std::vector output_addrs_; std::vector> input_tensors_; std::vector> output_tensors_; - std::vector ops_; cnmlFusionOp_t fusion_op_; std::vector const_data_storage_; #if PRINT_HW_TIME diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc index 2c1a2aeeff799d31d4328169fce058259543fb1f..32840736b8d9a9712d59a8175cd7d70311a34aad 100644 --- a/lite/kernels/mlu/bridges/interpolate_op.cc +++ b/lite/kernels/mlu/bridges/interpolate_op.cc @@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { nn_param)); CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); graph->FuseOp(interp_op); + CNML_CALL(cnmlDestroyBaseOp(&interp_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/layout_op.cc b/lite/kernels/mlu/bridges/layout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d14695c4357e06832e06a68646628bfa8d211c43 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("Input").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + std::shared_ptr output_tensor; + + CHECK(graph->HasNode(x_var_name)); + std::vector axis; + auto x_tensor = graph->GetNode(x_var_name); + auto x_data_order = x_tensor->dorder(); + auto x_dims = x->dims().Vectorize(); + if (x_data_order == CNML_NCHW) { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 2, 3, 1}; + break; + case 5: + axis = {0, 2, 3, 4, 1}; + break; + default: + CHECK(0) << "Unsupport shape"; + } + output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype()); + VLOG(3) << "layout transpose nchw to nhwc" << std::endl; + } else { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 3, 1, 2}; + break; + case 5: + axis = {0, 4, 1, 2, 3}; + break; + default: + CHECK(0) << "Unsupport shpae"; + } + VLOG(3) << "layout transpose nhwc to nchw" << std::endl; + output_tensor = graph->AddNode(out_var_name, + output_dims, + CNML_TENSOR, + CNML_NCHW, + x_tensor->dtype(), + CNML_NCHW); + } + cnmlBaseOp_t layout_op; + cnmlNdTransposeOpParam_t transpose_param; + CNML_CALL( + cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + graph->FuseOp(layout_op); + CNML_CALL(cnmlDestroyBaseOp(&layout_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(layout, + kMLU, + paddle::lite::subgraph::mlu::LayoutConverter); diff --git a/lite/kernels/mlu/bridges/layout_op_test.cc b/lite/kernels/mlu/bridges/layout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..69b905b0750fe99e29c6aaa9bffdc9f20229a239 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op_test.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
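+
+// Note on the converter above: LayoutConverter implements paddle's layout op
+// as an explicit cnmlCreateNdTransposeProOp. The direction is taken from the
+// data order recorded on the input node: a CNML_NCHW input is permuted to
+// NHWC, otherwise the inverse permutation is applied and the output node is
+// registered with CNML_NCHW order. For the common 4-D case the two
+// permutations used in the converter are simply:
+//   std::vector<int> nchw2nhwc = {0, 2, 3, 1};  // applied when the input is NCHW
+//   std::vector<int> nhwc2nchw = {0, 3, 1, 2};  // applied when the input is NHWC
+// The tests below exercise both directions for 2-D through 5-D shapes.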
+ +#include "lite/operators/layout_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_layout_NHWC2NCHW(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[1])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}, + {0, 3, 1, 2}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4]), + static_cast(input_shape[1])}, + {0, 4, 1, 2, 3}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +void test_layout_NCHW2NHWC(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + 
static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4])}, + {0, 2, 3, 4, 1}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +TEST(MLUBridges, layout) { + test_layout_NHWC2NCHW({12, 32, 4}); + test_layout_NHWC2NCHW({12, 32, 44, 3}); + test_layout_NHWC2NCHW({12, 32, 44, 3, 6}); + test_layout_NCHW2NHWC({12, 32, 55}); + test_layout_NCHW2NHWC({12, 32, 44, 3}); + test_layout_NCHW2NHWC({12, 32, 44, 3, 8}); + test_layout_NHWC2NCHW({12, 32}); + test_layout_NCHW2NHWC({12, 32}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(layout, kMLU); diff --git a/lite/kernels/mlu/bridges/lrn_op.cc b/lite/kernels/mlu/bridges/lrn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff428ab10cef170983de788b9af517558e1fd7f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
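+
+// Bridge for paddle's lrn op. The converter below reads alpha, beta, k and
+// the window size n from the op descriptor, accepts only the AcrossChannels
+// norm_region, and builds a cnmlCreateLrnOp (CNML_LRN_V3). The input side is
+// quantized via SetComputingDataType() with 1 / input_scale (INT8 by
+// default), while the output is kept at the graph FP type.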
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LrnConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create lrn node and get params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + if (op_info->HasAttr("norm_region")) { + CHECK(op_info->GetAttr("norm_region") == "AcrossChannels") + << "Unsuport WithinChannel"; + } + auto local_size = op_info->GetAttr("n"); + auto input_scale = op_info->GetInputScale(x_var_name)[0]; + VLOG(5) << "lrn input scale: " << input_scale; + + cnmlLrnOpParam_t param; + cnmlBaseOp_t lrn_op; + CNML_CALL( + cnmlCreateLrnOpParam(¶m, CNML_LRN_V3, local_size, alpha, beta, k)); + CNML_CALL(cnmlCreateLrnOp( + &lrn_op, param, input_tensor->mlu_tensor(), output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyLrnOpParam(¶m)); + + graph->SetComputingDataType( + lrn_op, input_tensor->mlu_tensor(), 1 / input_scale); + CNML_CALL(cnmlSetOperationComputingDataType( + lrn_op, output_tensor->mlu_tensor(), fp_type, nullptr)); + + graph->FuseOp(lrn_op); + CNML_CALL(cnmlDestroyBaseOp(&lrn_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(lrn, kMLU, paddle::lite::subgraph::mlu::LrnConverter); diff --git a/lite/kernels/mlu/bridges/lrn_op_test.cc b/lite/kernels/mlu/bridges/lrn_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..266446d6d3353bffa4398385703cd4cb64b4f53b --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op_test.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/lrn_op.h" +#include +#include +#include +#include +#include + +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +/** + * @brief get sum of x^2 between channels [size elements] + * + * @tparam float + * @param input + * @param channel_id: the c-th channel within n-th graph. 
+ * @param offset_within_channel: the pixel's offset within a channel. + * @param offset_num: the first address of n-th graph. + * @param c + * @param h + * @param w + * @param size + * @return float + */ +float lrn_square(const float* input, + int channel_id, + int offset_within_channel, + int offset_num, + int c, + int h, + int w, + int size) { + int pre_pad = (size - 1) / 2; + float res = 0; + const float* src = input + offset_num; + + // handle left channels with padding situation. + if (channel_id - pre_pad < 0) { + for (int i = 0; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle left channels. + if (channel_id - pre_pad >= 0) { + for (int i = channel_id - pre_pad; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels. + if (channel_id + pre_pad < c) { + for (int i = channel_id + 1; i <= channel_id + pre_pad; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels with padding situation. + if (channel_id + pre_pad >= c && channel_id + 1 < c) { + for (int i = channel_id + 1; i < c; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + return res; +} + +void lrn_compute_ref(std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = scope->FindVar(op_info->Output("Out").front()) + ->GetMutable(); + + const float* x_data = x->data(); + float* out_data = out->mutable_data(); + auto x_dims = x->dims(); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + auto norm_region = op_info->GetAttr("norm_region"); + auto local_size = op_info->GetAttr("n"); + + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int offset_num = 0; + int offset_within_channel = 0; + int dst_id; + + float square; + + for (int n = 0; n < N; ++n) { + offset_num = n * C * H * W; + + for (int c = 0; c < C; ++c) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + offset_within_channel = h * W + w; + dst_id = offset_num + c * H * W + offset_within_channel; + square = lrn_square(x_data, + c, + offset_within_channel, + offset_num, + C, + H, + W, + local_size); + out_data[dst_id] = x_data[dst_id] * pow(k + alpha * square, -beta); + } + } + } + } +} + +void test_lrn(float alpha, + float beta, + float k, + int local_size, + int n, + int c, + int h, + int w, + const std::string& norm_region) { + Scope scope; + std::string x_var_name("X_test"); + std::string out_var_name("Out_test"); + std::string out_ref_var_name("Out_ref"); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + + std::vector x_dim{n, c, h, w}; + x->Resize(x_dim); + out->Resize(x_dim); + out_ref->Resize(x_dim); + auto* x_data = x->mutable_data(); + FillTensor(x, 0.f, 1.f); + float *dmax, *dmin; + std::tie(dmin, dmax) = + std::minmax_element(x_data, x_data + x->data_size() - 1); + + cpp::OpDesc opdesc; + opdesc.SetType("lrn"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("alpha", alpha); + opdesc.SetAttr("beta", beta); + opdesc.SetAttr("k", k); + opdesc.SetAttr("n", 
local_size); + opdesc.SetAttr("norm_region", norm_region); + OpInfo op_info(opdesc); + op_info.SetInputScale(x_var_name, {(*dmax - *dmin) / 255.f}); + + auto op = CreateOp(op_info, &scope); + + // baseline + lrn_compute_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(x_dim[0]), + static_cast(x_dim[1]), + static_cast(x_dim[2]), + static_cast(x_dim[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims(); + output_trans.Resize(os); + transpose(out->mutable_data(), + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + + auto output_data = output_trans.mutable_data(); + auto* output_ref_data = out_ref->mutable_data(); + for (size_t i = 0; i < out->data_size(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, lrn) { + int local_size = 5; + float alpha = 0.0001f; + float beta = 0.75; + float k = 2.0f; + std::string norm_region = "AcrossChannels"; + for (int w : {2, 4, 8}) { + for (int h : {2, 4, 8}) { + for (int c : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + test_lrn(alpha, beta, k, local_size, n, c, h, w, norm_region); + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(lrn, kMLU) diff --git a/lite/kernels/mlu/bridges/norm_op.cc b/lite/kernels/mlu/bridges/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..492c3932a8c8a68f7eba687dde30d888d6e0f297 --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
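+
+// Bridge for paddle's norm op (L2 normalization along a single axis). The
+// converter below translates the axis attribute into the mode argument of
+// cnmlCreateNormalizeOpParamV2 (N/C/H/W -> 3/0/4/5) and emits a
+// cnmlCreateNormalizeOp without a scale tensor; the CPU reference and the
+// layout-transposing test live in norm_op_test.cc.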
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + std::vector nchw2nhwc = {0, 3, 1, 2}; + int nhwc_axis = nchw2nhwc[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + // ======== DEBUG =============== + VLOG(6) << "x name=" << x_var_name; + VLOG(6) << "out name=" << out_var_name; + VLOG(6) << "x dims=" << x->dims(); + VLOG(6) << "out dims=" << output->dims(); + VLOG(6) << "axis =" << axis; + VLOG(6) << "nwhc axis=" << nhwc_axis; + VLOG(6) << "epsilon =" << epsilon; + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======== DEBUG END ============ + cnmlBaseOp_t norm_op{nullptr}; + + cnmlNormalizeOpParam_t param; + int mode = -1; + switch (axis) { + case 0: + mode = 3; // N + break; + case 1: + mode = 0; // C + break; + case 2: + mode = 4; // H + break; + case 3: + mode = 5; // W + break; + default: + CHECK(0); + break; + } + cnmlCreateNormalizeOpParamV2(¶m, + 0, // p + 0, // use_scale + mode, + 1, // weight + epsilon); + + CNML_CALL(cnmlCreateNormalizeOp(&norm_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nullptr, + false /*is_fix8_mode*/)); + graph->FuseOp(norm_op); + CNML_CALL(cnmlDestroyBaseOp(&norm_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(norm, + kMLU, + paddle::lite::subgraph::mlu::NormConverter); diff --git a/lite/kernels/mlu/bridges/norm_op_test.cc b/lite/kernels/mlu/bridges/norm_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35b5eabbb9ffacd96c3ca6500dd9181f4d5bec5b --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/norm_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// void ToFile(std::string file_name, Tensor* tensor) { +// int count = tensor->dims().production(); +// auto data = tensor->mutable_data(); +// std::ostringstream outs; +// for (size_t i = 0; i < count; i++) { +// outs << data[i] << std::endl; +// } +// std::ofstream of; +// of.open(file_name, std::ios::out); +// of << outs.str(); +// of.close(); +// } + +void norm_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + out->Resize(x_dims.Vectorize()); + auto* out_data = out->mutable_data(); + + const auto* x_data = x->data(); + int pre_n = x_dims.count(0, axis); + int n = x_dims[axis]; + int post_n = x_dims.count(axis + 1, x_dims.size()); + for (int i = 0; i < pre_n; i++) { + for (int k = 0; k < post_n; k++) { + float sum = epsilon; + const float* in_tmp = x_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + sum += in_tmp[j * post_n] * in_tmp[j * post_n]; + } + sum = std::sqrt(sum); + float* out_tmp = out_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + out_tmp[j * post_n] = in_tmp[j * post_n] / sum; + } + } + } +} + +void test_norm(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + float epsilon = 1e-9f; + opdesc.SetType("norm"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + opdesc.SetAttr("epsilon", static_cast(epsilon)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + norm_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], 
out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, norm) { + test_norm({1, 2, 3, 4}, 1); + test_norm({1, 2, 3, 4}, 2); + test_norm({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(norm, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index d31ba0dd41111860a3b26d8ac3afb3273bef4557..be5c64b3b7056d0b8de1589d198db541b5a3777b 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -15,6 +15,7 @@ #pragma once USE_SUBGRAPH_BRIDGE(relu, kMLU); +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(conv2d, kMLU); USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU); USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU); @@ -24,5 +25,26 @@ USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); USE_SUBGRAPH_BRIDGE(concat, kMLU); USE_SUBGRAPH_BRIDGE(scale, kMLU); +USE_SUBGRAPH_BRIDGE(sigmoid, kMLU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU); +USE_SUBGRAPH_BRIDGE(dropout, kMLU); +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); +USE_SUBGRAPH_BRIDGE(split, kMLU); +USE_SUBGRAPH_BRIDGE(cast, kMLU); +USE_SUBGRAPH_BRIDGE(layout, kMLU); +USE_SUBGRAPH_BRIDGE(slice, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); +#ifdef LITE_BUILD_EXTRA +USE_SUBGRAPH_BRIDGE(gather, kMLU); +USE_SUBGRAPH_BRIDGE(lrn, kMLU) +USE_SUBGRAPH_BRIDGE(norm, kMLU) +#endif diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index f77c8084c76fc52c39938e723f02bde9b3cac41b..c734de1eec75d253a9b6b8d7a7f21d710df3d949 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto global_pooling = op_info->GetAttr("global_pooling"); auto ksize = op_info->GetAttr>("ksize"); auto strides = op_info->GetAttr>("strides"); + CHECK(!(op_info->HasAttr("exclusive") && + op_info->GetAttr("exclusive") == false)) + << "Unsupport param exclusive is false!"; if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { @@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } - int pad_height = paddings[0]; - int pad_width = paddings[2]; std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { padding_algorithm = op_info->GetAttr("padding_algorithm"); @@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (op_info->HasAttr("adaptive")) { adaptive = op_info->GetAttr("adaptive"); } + auto input_dims = x->dims(); + lite::operators::UpdatePadding(&paddings, global_pooling, adaptive, @@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - // std::vector output_shape({input_dims[0], input_dims[1]}); - // for (size_t i = 0; i < 2; i++) { - // output_shape.push_back( - // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - - // ksize[0]) / - // strides[i] + - // 1); - // } + if (global_pooling) { + ksize.resize(static_cast(input_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + ksize[i] = 
static_cast(input_dims[i + 2]); + } + } auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( - cnmlCreatePoolOpParam_V2(&pool_param, + cnmlCreatePoolOpParam_V3(&pool_param, ksize[0], ksize[1], strides[0], strides[1], - pad_height, - pad_width, - 1, // dilation - 1, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + 1, // dilation h + 1, // dilation w ToCnmlPoolMode(pooling_type), - ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL, + ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID, true, /* real */ 1 /* blend factor */)); cnmlBaseOp_t pool_op; @@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); graph->FuseOp(pool_op); + CNML_CALL(cnmlDestroyBaseOp(&pool_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 8cee8dbe86109b14cff49f329d71074a9b3bfb61..2ae888744fde3e94e857f04d50ceb1eb878f3c1c 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr op) { std::string pooling_type = op_info->GetAttr("pooling_type"); bool global_pooling = op_info->GetAttr("global_pooling"); + if (pooling_type == "max") { + for (int i = 0; i < out_dims.production(); ++i) { + dst_ptr[i] = -65504.f; + } + } + int in_n = in_dims[0]; int in_c = in_dims[1]; int in_h = in_dims[2]; @@ -203,62 +209,46 @@ void test_pool(int bs, } TEST(MLUBridges, pool) { - // for (auto pooling_type : {"max", "avg"}) { - // for (auto ceil_mode : {true, false}) { - // for (auto global_pooling : {/*true, */ false}) { - // for (auto exclusive : {true /*, false*/}) { - // for (auto ksize : {2, 3}) { - // for (auto stride : {1, 2}) { - // for (auto padding : {0, 1}) { - // for (auto bs : {1, 3}) { - // for (auto ic : {1, 3}) { - // for (auto ih : {3, 7}) { - // for (auto iw : {3, 7}) { - // test_pool(bs, - // ic, - // ih, - // iw, - // pooling_type, - // ceil_mode, - // global_pooling, - // exclusive, - // ksize, - // stride, - // padding); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - for (auto pooling_type : {"max", "avg"}) { for (auto ceil_mode : {true, false}) { - bool global_pooling = false; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); + for (auto global_pooling : {true, false}) { + for (auto exclusive : {true /*, false*/}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 7}) { + for (auto iw : {3, 7}) { + LOG(INFO) + << "shape: " << bs << ',' << ic << ',' << ih << ',' + << iw << '\t' << "pooling type: " << pooling_type + << '\t' << "ceil model: " << ceil_mode << '\t' + << "global_pooling: " << global_pooling << '\t' + << "exclusive: " << exclusive << '\t' + << "ksize: " << ksize << '\t' + << "stride: " << stride << '\t' + << "padding: " << padding; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + global_pooling, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } } } } diff --git a/lite/kernels/mlu/bridges/reshape_op.cc 
b/lite/kernels/mlu/bridges/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b47322b3462525be64e42b608d052719d7c5f0b --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Reshape op =================================== + cnmlBaseOp_t reshape_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlReshapeOpParam_t reshape_param{nullptr}; + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + CNML_CALL( + cnmlCreateNdReshapeOpParam(&reshape_param, cnml_trans2_input_shape, 4)); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&reshape_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Reshape op End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // =============== DEBUG ==================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + int cnml_input_shape[4]; + CNML_CALL(cnmlGetTensorShape(input_tensor->mlu_tensor(), cnml_input_shape)); + VLOG(6) << "cnml input dim: "; + for (size_t i = 0; i < 4; i++) { + VLOG(6) << cnml_input_shape[i]; + } + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // =============== DEBUG END ================= + + graph->FuseOp(trans1_op); + graph->FuseOp(reshape_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&reshape_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(reshape, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); diff --git a/lite/kernels/mlu/bridges/reshape_op_test.cc b/lite/kernels/mlu/bridges/reshape_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd2c6cc26f8f40ee83c99755d8842b072693b1a --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
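+
+// Note on the converter above: the MLU graph keeps tensors in NHWC order
+// while paddle's reshape semantics are defined on NCHW data, so
+// ReshapeConverter brackets the cnmlCreateReshapeOp with two
+// cnmlCreateNdTransposeProOp nodes: transpose NHWC -> NCHW, reshape to the
+// target shape, then transpose back to NHWC. All three ops are fused into the
+// subgraph and destroyed after fusion. The test below reshapes a {1, 2, 4, 4}
+// tensor to {1, 4, 2, 4} and compares against the untouched CPU copy.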
+ +#include "lite/operators/reshape_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_reshape(std::vector input_shape, + std::vector out_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reshape2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + std::vector shape_attr; + shape_attr.resize(out_shape.size()); + for (size_t i = 0; i < out_shape.size(); i++) { + shape_attr[i] = static_cast(out_shape[i]); + } + + opdesc.SetAttr>("shape", shape_attr); + auto op = CreateOp(opdesc, &scope); + + auto os = out->dims(); + out->Resize(out_shape); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor out_trans; + out_trans.Resize(out_shape); + transpose(out->mutable_data(), + out_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[1]), + static_cast(out_shape[2]), + static_cast(out_shape[3])}, + {0, 3, 1, 2}); + out->CopyDataFrom(out_trans); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, reshape) { test_reshape({1, 2, 4, 4}, {1, 4, 2, 4}); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc index 5557602bd7576ccd71c51f52a538a45fe27f7ada..5b6b3dff7969562b19344f9eccbf219d26c3e02d 100644 --- a/lite/kernels/mlu/bridges/scale_op.cc +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { alpha_tensor->mlu_tensor(), beta_tensor->mlu_tensor())); graph->FuseOp(scale_op); + CNML_CALL(cnmlDestroyBaseOp(&scale_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/slice_op.cc b/lite/kernels/mlu/bridges/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..067d110bf4160c5bcf2bbd3009d82bbb5804c998 --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // input + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_shape = input->dims().Vectorize(); + // output + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + // attr + auto axes = op_info->GetAttr>("axes"); + auto starts = op_info->GetAttr>("starts"); + auto ends = op_info->GetAttr>("ends"); + + CHECK(graph->HasNode(input_var_name)); + auto input_tensor = graph->GetNode(input_var_name); + auto output_tensor = graph->AddNode(output_var_name, + output->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + std::vector begin_index(input_shape.size(), 0); + std::vector end_index(input_shape.size()); + std::vector strides(input_shape.size(), 1); + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(input_shape.size())); + for (size_t i = 0; i < input_shape.size(); ++i) { + end_index[nhwc2nchw_axis[i]] = input_shape[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = input_shape[axes[i]]; + int end = ends[i] < 0 ? std::max(ends[i] + dim_value, 0) : ends[i]; + begin_index[nhwc2nchw_axis[axes[i]]] = + starts[i] < 0 ? std::max(starts[i] + dim_value, 0) : starts[i]; + end_index[nhwc2nchw_axis[axes[i]]] = std::min(end, dim_value); + } + + cnmlNdStridedSliceOpParam_t param; + cnmlBaseOp_t slice_op; + CNML_CALL(cnmlCreateNdStridedSliceOpParam(¶m, + input_shape.size(), + begin_index.data(), + end_index.data(), + strides.data())); + CNML_CALL(cnmlCreateNdStridedSliceOp(&slice_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyNdStridedSliceOpParam(¶m)); + + graph->FuseOp(slice_op); + CNML_CALL(cnmlDestroyBaseOp(&slice_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(slice, + kMLU, + paddle::lite::subgraph::mlu::SliceConverter); diff --git a/lite/kernels/mlu/bridges/slice_op_test.cc b/lite/kernels/mlu/bridges/slice_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5e2a9f5a4c99b6f46fff24686cdbe546cae727d --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
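+
+// Note on the converter above: SliceConverter maps paddle's slice attributes
+// (axes / starts / ends, with negative indices counted from the end and ends
+// clamped to the dimension size) onto begin/end/stride vectors for
+// cnmlCreateNdStridedSliceOp, re-indexing each axis through GetAxisNHWC2NCHW
+// so the bounds match the NHWC-ordered MLU tensor. The test below uses
+// slice_ref() as the CPU reference and transposes data into and out of NHWC
+// around LaunchOp().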
+ +#include "lite/operators/slice_op.h" +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +static void slice_ref(const float* input, + std::vector in_dims, + std::vector axes, + std::vector starts, + std::vector ends, + float* out) { + auto out_dims = in_dims; + std::vector real_starts(in_dims.size(), 0); + std::vector real_ends(in_dims.size(), 0); + std::vector real_step(in_dims.size(), 0); + for (size_t i = 0; i < in_dims.size(); i++) { + real_ends[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = in_dims[axes[i]]; + if (dim_value > 0) { + int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + out_dims[axes[i]] = end - start; + real_starts[axes[i]] = start; + real_ends[axes[i]] = end; + } + } + const int LEN = in_dims.size(); + int dst_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + dst_step[i] = 1; + } + int src_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + src_step[i] = 1; + } + int out_num = out_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; i--) { + dst_step[i] = out_dims[i + 1] * dst_step[i + 1]; + src_step[i] = in_dims[i + 1] * src_step[i + 1]; + out_num *= out_dims[i]; + } + + for (int dst_id = 0; dst_id < out_num; dst_id++) { + int src_id = 0; + int index_id = dst_id; + for (size_t j = 0; j < out_dims.size(); j++) { + int cur_id = index_id / dst_step[j]; + index_id = index_id % dst_step[j]; + src_id += (cur_id + real_starts[j]) * src_step[j]; + } + out[dst_id] = input[src_id]; + } +} + +static void test_case(std::vector x_shape, + std::vector out_shape, + std::vector starts, + std::vector ends, + std::vector axes) { + Scope scope; + + std::string x_var_name = "x"; + std::string out_var_name = "out"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + x->Resize(lite::DDim(x_shape)); + out->Resize(lite::DDim(out_shape)); + + auto x_data = x->mutable_data(); + FillTensor(x, 0.f, 2.f); + + cpp::OpDesc opdesc; + opdesc.SetType("slice"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axes", axes); + opdesc.SetAttr("starts", starts); + opdesc.SetAttr("ends", ends); + + std::vector out_ref(out->data_size(), 0); + slice_ref(x_data, x_shape, axes, starts, ends, out_ref.data()); + + auto type_cast = [](int64_t in) { return static_cast(in); }; + std::vector i_dims; + std::transform( + x_shape.cbegin(), x_shape.cend(), std::back_inserter(i_dims), type_cast); + + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(x_shape.size())); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + i_dims, + nchw2nhwc_axis); + x->CopyDataFrom(input_x); + + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims().Vectorize(); + output_trans.Resize(os); + std::vector o_dims(os.size()); + for (size_t i = 0; i < os.size(); ++i) { + o_dims[i] = os[nchw2nhwc_axis[i]]; + } + transpose(out->mutable_data(), + output_trans.mutable_data(), + o_dims, + GetAxisNHWC2NCHW(x_shape.size())); + + auto out_data = output_trans.mutable_data(); + for 
(DDim::value_type i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_ref[i], out_data[i], 1e-4); + } +} + +TEST(MLUBridges, slice) { + /* test_case({3}, {3}, {-3}, {3}, {0}); */ + test_case({3, 4}, {3, 4}, {-3, 0}, {3, 100}, {0, 1}); + test_case({3, 4, 5}, {3, 4, 2}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + test_case({3, 4, 5, 6}, {3, 4, 2, 6}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + /* test_case({3, 4, 5, 6, 3}, {3, 4, 2, 6, 3}, {-3, 0, 2}, {3, 100, -1}, {0, + * 1, 2}); */ + /* test_case({3, 4, 5, 6, 5, 2}, {3, 4, 2, 6, 5, 2}, {-3, 0, 2}, {3, 100, 1}, + * {0, 1, 2}); */ +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(slice, kMLU); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc index 17c911675718a15c7ede4888b268ffcd62b4d8ed..b1b621c1efc6cbc54092a8082e4d624355e07652 100644 --- a/lite/kernels/mlu/bridges/softmax_op.cc +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); + auto x_shape = + scope->FindVar(x_var_name)->GetMutable()->dims().Vectorize(); - // nchw axis to nhwc aixs - int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + // nchw axis to nhwc axis int axis = 1; if (op_info->HasAttr("axis")) { axis = op_info->GetAttr("axis"); @@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis = output_dims.size() + axis; } } - int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(x_shape.size())[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->GetNode(x_var_name)->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(softmax_op); + CNML_CALL(cnmlDestroyBaseOp(&softmax_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc index a5251ed43c9187fc2874f9b01853b45b8abf7f1c..d5d7251205a0f60b9e5c8568a58ba48661c9df3e 100644 --- a/lite/kernels/mlu/bridges/softmax_op_test.cc +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -93,7 +93,7 @@ void test_softmax(const std::vector& input_shape, int axis) { opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor softmax_ref(op); diff --git a/lite/kernels/mlu/bridges/split_op.cc b/lite/kernels/mlu/bridges/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4188ba3ec08161552bc688c212408fa81ae815a3 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out"); + + auto param_axis = op_info->GetAttr("axis"); + + auto num = op_info->GetAttr("num"); + auto sections = op_info->GetAttr>("sections"); + int64_t sections_num = static_cast(sections.size()); + auto output_num = num > 0 ? num : sections_num; + + std::vector output_tensor; + for (auto out_name : out_var_name) { + auto out = scope->FindVar(out_name)->GetMutable(); + auto out_dims = out->dims().Vectorize(); + auto out_tensor = graph->AddNode( + out_name, out_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + output_tensor.push_back(out_tensor->mlu_tensor()); + } + + auto dims = x_dims.size(); + int axis = (param_axis < 0) ? (param_axis + dims) : param_axis; + CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + cnmlBaseOp_t split_op; + cnmlTensor_t inputs = input_tensor->mlu_tensor(); + CNML_CALL(cnmlCreateNdSplitOp( + &split_op, nhwc_axis, &inputs, 1, output_tensor.data(), output_num)); + graph->FuseOp(split_op); + CNML_CALL(cnmlDestroyBaseOp(&split_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(split, + kMLU, + paddle::lite::subgraph::mlu::SplitConverter); diff --git a/lite/kernels/mlu/bridges/split_op_test.cc b/lite/kernels/mlu/bridges/split_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a44a45504036e9ef6199e9d2b534aa3dde63bb01 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op_test.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
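The split converter above, like the softmax and slice bridges, remaps an NCHW axis index to its NHWC position with `GetAxisNHWC2NCHW`, which this patch defines later in `utility.h`. A minimal standalone sketch of that permutation, assuming the same construction as the patch's helper:

#include <cstdio>
#include <vector>

// Same construction as GetAxisNHWC2NCHW in utility.h: index by an NCHW axis,
// read off the corresponding NHWC axis.
std::vector<int> AxisNHWC2NCHW(size_t n) {
  std::vector<int> a(n);
  a[0] = 0;
  if (n > 1) a[1] = static_cast<int>(n) - 1;
  for (size_t i = 2; i < n; ++i) a[i] = static_cast<int>(i) - 1;
  return a;
}

int main() {
  auto m = AxisNHWC2NCHW(4);  // {0, 3, 1, 2}
  // An op acting on NCHW axis 1 (channels) acts on NHWC axis m[1] == 3,
  // which is what the softmax bridge computes for nhwc_axis.
  std::printf("axis 1 (NCHW) -> axis %d (NHWC)\n", m[1]);
  return 0;
}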
+ +#include "lite/operators/split_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void split_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + int num = op_info->GetAttr("num"); + int axis = op_info->GetAttr("axis"); + std::vector sections = op_info->GetAttr>("sections"); + std::vector output_vec; + auto output = op_info->Output("Out"); + for (auto out_var : output) { + output_vec.push_back(scope->Var(out_var)->GetMutable()); + } + auto in_dims = x->dims(); + auto rank = in_dims.size(); + int outs_number = output_vec.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + if (axis < 0) { + axis += rank; + } + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + for (int j = 0; j < outs_dims.size(); ++j) { + output_vec[j]->Resize(outs_dims[j]); + } + + const dtype* din = x->mutable_data(); + std::vector in_strides(in_dims.size()); + in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dims[i]; + } + + int input_offset = 0; + for (auto out : output_vec) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, + din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +void test_split(int bs, + int ic, + int ih, + int iw, + int axis, + int num, + std::vector sections) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name_1 = "out_1"; + std::string out_var_name_2 = "out_2"; + std::string out_ref_var_name_1 = "out_ref_1"; + std::string out_ref_var_name_2 = "out_ref_2"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); + auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); + auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); + auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("split"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); + opdesc.SetAttr("axis", axis); + opdesc.SetAttr("sections", sections); + opdesc.SetAttr("num", num); + + auto op = CreateOp(opdesc, &scope); + split_ref(op); + out_ref_1->CopyDataFrom(*out_1); + out_ref_2->CopyDataFrom(*out_2); + // execute reference 
implementation and save to output tensor + + Tensor input; + input.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(bs), + static_cast(ic), + static_cast(ih), + static_cast(iw)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); + + // compare results + auto* out_data_1 = out_1->mutable_data(); + auto* out_data_2 = out_2->mutable_data(); + auto* out_ref_data_1 = out_ref_1->mutable_data(); + auto* out_ref_data_2 = out_ref_2->mutable_data(); + + Tensor output1, output2; + output1.Resize(out_1->dims()); + output2.Resize(out_2->dims()); + transpose(out_data_1, + output1.mutable_data(), + {static_cast(out_1->dims()[0]), + static_cast(out_1->dims()[2]), + static_cast(out_1->dims()[3]), + static_cast(out_1->dims()[1])}, + {0, 3, 1, 2}); + transpose(out_data_2, + output2.mutable_data(), + {static_cast(out_2->dims()[0]), + static_cast(out_2->dims()[2]), + static_cast(out_2->dims()[3]), + static_cast(out_2->dims()[1])}, + {0, 3, 1, 2}); + out_data_1 = output1.mutable_data(); + out_data_2 = output2.mutable_data(); + for (int i = 0; i < out_1->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); + } + for (int i = 0; i < out_2->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); + } +} + +TEST(MLUBridges, split) { + test_split(4, 2, 3, 1, 0, 2, {}); + test_split(4, 2, 3, 1, 0, 0, {3, 1}); + test_split(4, 6, 3, 1, 1, 2, {}); + test_split(4, 6, 3, 1, 1, 0, {2, 4}); + test_split(4, 2, 2, 1, 2, 2, {}); + test_split(4, 2, 6, 1, 2, 0, {3, 3}); + test_split(4, 2, 3, 4, 3, 2, {}); + test_split(4, 2, 3, 6, 3, 0, {5, 1}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(split, kMLU); diff --git a/lite/kernels/mlu/bridges/squeeze_op.cc b/lite/kernels/mlu/bridges/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f8af5b014bdba29bb50036473f671ec359f26d4 --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
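The reference implementation above (`split_ref`) derives each output's shape either from `num` (equal parts along the split axis) or from explicit `sections`, after normalizing a negative axis. A small standalone sketch of that shape computation (the function name is illustrative):

#include <cassert>
#include <cstdio>
#include <vector>

// Compute per-output dims for a split along `axis`, either into `num`
// equal parts or according to explicit `sections`.
std::vector<std::vector<int>> SplitDims(std::vector<int> in_dims,
                                        int axis,
                                        int num,
                                        const std::vector<int>& sections) {
  if (axis < 0) axis += static_cast<int>(in_dims.size());
  std::vector<std::vector<int>> outs;
  if (num > 0) {
    assert(in_dims[axis] % num == 0);
    for (int i = 0; i < num; ++i) {
      auto d = in_dims;
      d[axis] = in_dims[axis] / num;
      outs.push_back(d);
    }
  } else {
    for (int s : sections) {
      auto d = in_dims;
      d[axis] = s;
      outs.push_back(d);
    }
  }
  return outs;
}

int main() {
  // {4, 6, 3, 1} split along axis 1 into sections {2, 4}, as in the test above.
  auto outs = SplitDims({4, 6, 3, 1}, 1, 0, {2, 4});
  for (const auto& d : outs)
    std::printf("%d %d %d %d\n", d[0], d[1], d[2], d[3]);
  return 0;
}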
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto output_dims_nhwc = DimNCHW2NHWC(output_dims); + std::vector o_dims(output_dims.size()); + std::transform(output_dims_nhwc.cbegin(), + output_dims_nhwc.cend(), + o_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlReshapeOpParam_t param; + cnmlBaseOp_t squeeze_op; + CNML_CALL(cnmlCreateNdReshapeOpParam(¶m, o_dims.data(), o_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze_op)); + + if (op_type == "squeeze2") { + auto xshape_var_name = op_info->Output("XShape").front(); + auto xshape = scope->FindVar(xshape_var_name)->GetMutable(); + auto dims_64 = xshape->dims().Vectorize(); + auto dims_64_nhwc = DimNCHW2NHWC(dims_64); + auto xshape_tensor = graph->AddNode( + xshape_var_name, dims_64, CNML_TENSOR, CNML_NCHW, fp_type); + + std::vector xshape_dims(dims_64.size()); + std::transform(dims_64_nhwc.cbegin(), + dims_64_nhwc.cend(), + xshape_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlBaseOp_t squeeze2_op; + CNML_CALL(cnmlCreateNdReshapeOpParam( + ¶m, xshape_dims.data(), xshape_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze2_op, + param, + input_tensor->mlu_tensor(), + xshape_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze2_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze2_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(squeeze, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); +REGISTER_SUBGRAPH_BRIDGE(squeeze2, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); diff --git a/lite/kernels/mlu/bridges/squeeze_op_test.cc b/lite/kernels/mlu/bridges/squeeze_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad16dac2e978fa977acacf62ed6adca16365ed6d --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/squeeze_op.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// squeeze +TEST(MLUBridges, squeeze) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // SqueezeCompute squeeze; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + std::vector axes{0, -2}; + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + auto x_data = out_ref->data(); + auto out_data = out->data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + } +} + +// squeeze2 +TEST(MLUBridges, squeeze2) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string xshape_var_name("xshape"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* xshape = scope.Var(xshape_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + std::vector xshape_shape({1, 3, 1, 5}); + xshape->Resize(xshape_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // Squeeze2Compute squeeze2; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("XShape", {xshape_var_name}); + + std::vector axes({0, -2}); + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name, xshape_var_name}); + + auto x_data = out_ref->mutable_data(); + auto out_data = out->mutable_data(); + auto xshape_data = xshape->mutable_data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + EXPECT_NEAR(xshape_data[j], x_data[j], 1e-5); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc index be7e1f09beaee61dace598b958ab4f95f14b38f8..f1bf48d66e8693e72a96f0f52c285a717f464128 100644 --- 
a/lite/kernels/mlu/bridges/tensor.cc +++ b/lite/kernels/mlu/bridges/tensor.cc @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include namespace paddle { @@ -25,8 +28,9 @@ namespace mlu { MLUTensor::MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, - cnmlDataType_t mlu_dtype) + cnmlDataOrder_t shape_order, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order) : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) { std::vector int_shape; for (auto i : shape) { @@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector& shape, LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!"; } } - remember(int_shape, tensor_type, mlu_dtype, data_order); + remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order); } void MLUTensor::remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order) { + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order) { tensor_type_ = tensor_type; mlu_dtype_ = mlu_dtype; + data_order_ = data_order; + origin_shape_.assign(shape.begin(), shape.end()); int size = 4; if (shape.size() > 4 || shape_order == CNML_ARRAY) { @@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector& shape, break; } } - dim_ = shape_.size(); + auto shape_NCHW = DimNHWC2NCHW(shape_); + shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end()); + dim_ = shape_NCHW.size(); + shape_ = DimNCHW2NHWC(shape_NCHW); } void MLUTensor::Create() { if (mlu_tensor_ == nullptr) { CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_)); std::vector dim_shape(shape_); + if (data_order_ == CNML_NCHW) { + std::transform(origin_shape_.cbegin(), + origin_shape_.cend(), + dim_shape.begin(), + [](DDim::value_type in) { return static_cast(in); }); + } int* dim_strides = nullptr; CNML_CALL(cnmlSetTensorShape_V2( mlu_tensor_, dim_, dim_shape.data(), dim_strides)); @@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; } +void MLUTensor::ToFile(std::string file_name) { + if (mlu_ptr_) { + VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name; + int count = 1; + for (size_t i = 0; i < shape_.size(); i++) { + count *= shape_[i]; + } + VLOG(6) << " dump count: " << count; + VLOG(6) << " dump shape: "; + for (size_t i = 0; i < shape_.size(); i++) { + VLOG(6) << shape_[i] << " "; + } + + std::vector cpu_data_fp32(count); + // fp16 to fp32 + if (mlu_dtype_ == CNML_DATA_FLOAT16) { + VLOG(6) << " convert fp16 to fp32 "; + std::vector cpu_data_fp16(count); + cnrtMemcpy(cpu_data_fp16.data(), + mlu_ptr_, + count * sizeof(uint16_t), + CNRT_MEM_TRANS_DIR_DEV2HOST); + for (int i = 0; i < count; i++) { + cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]); + } + } else { + cnrtMemcpy(cpu_data_fp32.data(), + mlu_ptr_, + count * sizeof(float), + CNRT_MEM_TRANS_DIR_DEV2HOST); + } + + // trans to nchw + std::vector cpu_data_trans(count); + if (data_order_ != CNML_NCHW) { + switch (shape_.size()) { + case 4: + transpose(cpu_data_fp32.data(), + cpu_data_trans.data(), + shape_, + {0, 3, 1, 2}); + break; + case 3: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1}); + break; + case 2: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1}); + break; + case 1: + transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0}); + break; + default: + CHECK(0) << "ToFile only support dim <=4"; + break; + } + } + + // to file + std::ostringstream outs; + for 
(int i = 0; i < count; i++) { + if (data_order_ == CNML_NCHW) { + outs << cpu_data_fp32[i] << std::endl; + } else { + outs << cpu_data_trans[i] << std::endl; + } + } + std::ofstream of; + of.open(file_name, std::ios::out); + of << outs.str(); + of.close(); + } else { + LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " + << file_name; + } +} + MLUTensor::~MLUTensor() { if (mlu_tensor_ != nullptr) { CNML_CALL(cnmlDestroyTensor(&mlu_tensor_)); diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h index 12dc97a772dabc529bf183f783a22a9f2dfa936d..22268f69ba39926dbbfb1bbb18e3a86331097f90 100644 --- a/lite/kernels/mlu/bridges/tensor.h +++ b/lite/kernels/mlu/bridges/tensor.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include "lite/kernels/mlu/bridges/utility.h" @@ -33,13 +35,15 @@ class MLUTensor { MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, - cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32); + cnmlDataOrder_t shape_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC); void remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order); + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order); void Create(); cnmlTensor_t mlu_tensor(); void* mlu_data() { @@ -47,14 +51,21 @@ class MLUTensor { return mlu_ptr_; } + cnmlDataType_t dtype() { return mlu_dtype_; } void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; } + const std::vector& get_origin_shape() const { return origin_shape_; } + ~MLUTensor(); + void ToFile(std::string file_name); + cnmlDataOrder_t dorder() { return data_order_; } + private: cnmlTensor_t mlu_tensor_; std::vector shape_; + std::vector origin_shape_; cnmlTensorType_t tensor_type_; cnmlDataType_t mlu_dtype_; int dim_{0}; diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc index 377a00689ef3a27f78ae008072578ab3701cd337..36eeb473f6a37aa28a9447280f808f5fb08978d0 100644 --- a/lite/kernels/mlu/bridges/test_helper.cc +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -24,18 +24,38 @@ namespace lite { namespace subgraph { namespace mlu { +template +void PrepareInput(Graph* graph, + const std::string& input_name, + Tensor* input_tensor, + cnmlDataOrder_t order) { + thread_local Tensor temp_input; + temp_input.Resize(input_tensor->dims().Vectorize()); + temp_input.CopyDataFrom(*input_tensor); + using data_type = typename MLUTypeTraits::type; + auto input_node = graph->AddNode( + input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + MLUTypeTraits::cnml_type, + order, + reinterpret_cast( + input_tensor->template mutable_data(TARGET(kMLU)))); + CHECK(input_node); + CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data(), + temp_input.mutable_data(), + sizeof(data_type) * input_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_HOST2DEV)); +} + void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names) { + const std::vector& output_var_names, + cnmlDataOrder_t order) { CNRT_CALL(cnrtInit(0)); - ::paddle::lite::SetMluDevice(0); + lite::SetMluDevice(0); cnrtQueue_t queue_; - cnrtInvokeFuncParam_t forward_param; - u32_t affinity = 1; - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; CNRT_CALL(cnrtCreateQueue(&queue_)); 
cnrtDev_t dev_handle; CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0)); @@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr op, // Convert input data var and add it into the MLU IR graph for (auto& input_name : input_var_names) { auto input_tensor = scope->FindMutableTensor(input_name); - CHECK(input_tensor); - Tensor temp_input; - temp_input.Resize(input_tensor->dims().Vectorize()); - temp_input.CopyDataFrom(*input_tensor); - auto input_node = - graph.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph.FPType(), - reinterpret_cast( - input_tensor->mutable_data(TARGET(kMLU)))); - CHECK(input_node); - CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data(), - temp_input.mutable_data(), - sizeof(float) * input_tensor->dims().production(), - CNRT_MEM_TRANS_DIR_HOST2DEV)); + auto data_type = input_tensor->precision(); + + switch (data_type) { +#define PREPARE_INPUT(type__) \ + case PRECISION(type__): \ + PrepareInput(&graph, input_name, input_tensor, order); \ + break; + PREPARE_INPUT(kFP16) + PREPARE_INPUT(kFloat) + PREPARE_INPUT(kInt8) + PREPARE_INPUT(kInt32) +#undef PREPARE_INPUT + default: + CHECK(0); + } } op->CheckShape(); op->InferShape(); @@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr op, } graph.Compile(CNML_MLU270, 1); + graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs())); + CNRT_CALL(cnrtSyncQueue(queue_)); - graph.Compute(forward_param, queue_); for (auto& output_name : output_var_names) { auto output_tensor = scope->FindMutableTensor(output_name); Tensor temp_out; diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h index 4da9e72dfcc5a81a68467f7622e2c16aedb2ded5..36fe6f1efaed76deccdc6e9542bb52a2aefc2571 100644 --- a/lite/kernels/mlu/bridges/test_helper.h +++ b/lite/kernels/mlu/bridges/test_helper.h @@ -58,7 +58,8 @@ void FillTensor(Tensor* x, void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names); + const std::vector& output_var_names, + cnmlDataOrder_t order = CNML_NHWC); } // namespace mlu } // namespace subgraph diff --git a/lite/kernels/mlu/bridges/transpose_op.cc b/lite/kernels/mlu/bridges/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6caeb3613fea8f348e3990ec2c9660321590116 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
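`LaunchOp` above selects a concrete element type from the input tensor's runtime precision by switching into a `PrepareInput` template instantiation through a local macro. A minimal sketch of that switch-to-template dispatch pattern, with an illustrative enum and types standing in for the real precision and type traits:

#include <cstdint>
#include <cstdio>

enum class Precision { kFloat, kInt8, kInt32 };

template <typename T>
void Prepare(const void* data, int n) {
  // The real helper copies host data into the device buffer; here we only
  // report the element size that would be copied.
  std::printf("copy %d elements of %zu bytes\n", n, sizeof(T));
  (void)data;
}

void Dispatch(Precision p, const void* data, int n) {
  switch (p) {
#define PREPARE(prec, type)  \
  case Precision::prec:      \
    Prepare<type>(data, n);  \
    break;
    PREPARE(kFloat, float)
    PREPARE(kInt8, int8_t)
    PREPARE(kInt32, int32_t)
#undef PREPARE
  }
}

int main() {
  float buf[4] = {0};
  Dispatch(Precision::kFloat, buf, 4);
  return 0;
}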
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::vector axis_to_nhwc(const std::vector& axis) { + std::vector new_axis(axis.size()); + + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(axis.size())); + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(axis.size())); + + for (size_t i = 0; i < new_axis.size(); ++i) { + new_axis[i] = nhwc2nchw_axis[axis[nchw2nhwc_axis[i]]]; + } + return new_axis; +} + +int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + auto axis = op_info->GetAttr>("axis"); + std::vector axis_nhwc = axis_to_nhwc(axis); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t transpose_op{nullptr}; + + cnmlNdTransposeOpParam_t transpose_param{nullptr}; + + CNML_CALL(cnmlCreateNdTransposeOpParam( + &transpose_param, axis_nhwc.data(), axis_nhwc.size())); + + // Use cnmlCreatexxxOpForward to create op. + CNML_CALL(cnmlCreateNdTransposeProOp(&transpose_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + + graph->FuseOp(transpose_op); + CNML_CALL(cnmlDestroyBaseOp(&transpose_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle +REGISTER_SUBGRAPH_BRIDGE(transpose, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); diff --git a/lite/kernels/mlu/bridges/transpose_op_test.cc b/lite/kernels/mlu/bridges/transpose_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e8f7890581279f0ab4d51006c194967fd9c61e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/transpose_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int data_index(std::vector pos, DDimLite dims) { + int d1 = dims[1]; + int d2 = dims[2]; + int d3 = dims[3]; + return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; +} + +std::vector pos_trans(std::vector in_pos, std::vector axis) { + std::vector out_pos(in_pos.size()); + for (size_t i = 0; i < axis.size(); i++) { + out_pos[axis[i]] = in_pos[i]; + } + return out_pos; +} + +template +void transpose_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto input = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto output = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_dims = input->dims(); + auto y_dims = output->dims(); + auto axis = op_info->GetAttr>("axis"); + + // auto input_data = input->data(); + auto* input_data = input->mutable_data(); + auto* output_data = output->mutable_data(); + + int input_n = x_dims[0]; + int input_c = x_dims[1]; + int input_h = x_dims[2]; + int input_w = x_dims[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + std::vector in_pos{n, c, h, w}; + std::vector out_pos = pos_trans(in_pos, axis); + int in_index = data_index(in_pos, x_dims); + int out_index = data_index(out_pos, y_dims); + output_data[out_index] = input_data[in_index]; + } + } + } + } +} + +void test_transpose(const std::vector& input_shape, + std::vector axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("transpose"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + + // transpose_ref must run befor LaunchOp + // otherwise get Cannot access memory + // execute reference implementation and save to output tensor + transpose_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(DDim(input_shape)); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out->dims()); + auto os = out->dims(); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +// TODO(pmshst): fix the transpose test 
+TEST(MLUBridges, transpose) { + std::vector input_shape = {2, 3, 4, 5}; + test_transpose(input_shape, std::vector{0, 1, 3, 2}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc index cd78553a652433fc41334a6bff5575031f5125e0..b53debd643ae2b1080644d2844d702797addabec 100644 --- a/lite/kernels/mlu/bridges/utility.cc +++ b/lite/kernels/mlu/bridges/utility.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/mlu/bridges/utility.h" + #include namespace paddle { @@ -20,33 +21,21 @@ namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, - std::vector input_shape, - std::vector axis) { +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape) { + CHECK_EQ(input_shape.size(), 2); int old_index = -1; int new_index = -1; - int dim[4] = {0}; - std::vector shape = input_shape; - for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { - for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { - for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { - for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { - old_index = dim[0] * shape[1] * shape[2] * shape[3] + - dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; - new_index = - dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - output_data[new_index] = input_data[old_index]; - } - } + for (int i = 0; i < input_shape[0]; i++) { + for (int j = 0; j < input_shape[1]; j++) { + old_index = i * input_shape[1] + j; + new_index = j * input_shape[0] + i; + output_data[new_index] = input_data[old_index]; } } } -int scale2position(float scale) { return static_cast(-std::log2(scale)); } - void dequant(float* dst, int8_t* src, size_t size, float scale) { for (size_t i = 0; i < size; ++i) { dst[i] = static_cast(src[i]) * scale; diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h index fa8fb1597c0fb068a855928dd20057d48ecd5eaf..fd1e5eb265936f11f258d86e2b6a91af1d55c6ed 100644 --- a/lite/kernels/mlu/bridges/utility.h +++ b/lite/kernels/mlu/bridges/utility.h @@ -16,24 +16,76 @@ #include #include + #include #include #include + #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" +#include "lite/fluid/float16.h" namespace paddle { namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape); + +template +void transpose(dtype* input_data, + dtype* output_data, std::vector input_shape, - std::vector axis); -int scale2position(float scale); + std::vector axis) { + int old_index = -1; + int new_index = -1; + std::vector shape; + std::vector expand_axis; + if (input_shape.size() < 5u) { + for (size_t i = 0; i < 5 - input_shape.size(); i++) { + shape.push_back(1); + expand_axis.push_back(i); + } + for (size_t i = 0; i < input_shape.size(); i++) { + shape.push_back(input_shape[i]); + expand_axis.push_back(axis[i] + 5 - input_shape.size()); + } + } else { + shape = input_shape; + expand_axis = axis; + } + int dim[5] = {0}; + for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < 
shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) { + for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] + + dim[1] * shape[2] * shape[3] * shape[4] + + dim[2] * shape[3] * shape[4] + dim[3] * shape[4] + + dim[4]; + new_index = dim[expand_axis[0]] * shape[expand_axis[1]] * + shape[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[1]] * shape[expand_axis[2]] * + shape[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[4]]; + output_data[new_index] = input_data[old_index]; + } + } + } + } + } +} + +inline int scale2position(float scale) { return std::floor(-std::log2(scale)); } + void dequant(float* dst, int8_t* src, size_t size, float scale); void dequant(float* dst, @@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC( std::vector({dim[0], dim[2], dim[3], dim[1]})); } -inline const std::vector DimNHWC2NCHW( - const std::vector& dim) { - return std::vector({dim[0], dim[3], dim[1], dim[2]}); +template +inline const std::vector DimNHWC2NCHW( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[3], dim[1], dim[2]}); + case 5: + return std::vector({dim[0], dim[4], dim[1], dim[2], dim[3]}); + default: + CHECK(0) << "unsupport dimension"; + } +} + +template +inline const std::vector DimNCHW2NHWC( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[2], dim[3], dim[1]}); + case 5: + return std::vector({dim[0], dim[2], dim[3], dim[4], dim[1]}); + default: + CHECK(0) << "unsupport dimension"; + } } -inline const std::vector DimNCHW2NHWC( - const std::vector& dim) { - return std::vector({dim[0], dim[2], dim[3], dim[1]}); +template +inline std::vector GetAxisNHWC2NCHW(size_t n_dims) { + std::vector nhwc2nchw_axis(n_dims); + nhwc2nchw_axis[0] = 0; + if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1; + for (size_t i = 2; i < n_dims; ++i) { + nhwc2nchw_axis[i] = i - 1; + } + return nhwc2nchw_axis; +} + +template +inline std::vector GetAxisNCHW2NHWC(size_t n_dims) { + std::vector nchw2nhwc_axis(n_dims); + nchw2nhwc_axis[0] = 0; + for (size_t i = 1; i < n_dims - 1; ++i) { + nchw2nhwc_axis[i] = i + 1; + } + if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1; + return nchw2nhwc_axis; } template -struct FPTypeTraits {}; +struct MLUTypeTraits { + /* using type = void; */ + /* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */ +}; + +template <> +struct MLUTypeTraits { + using type = float; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32; +}; + +template <> +struct MLUTypeTraits { + using type = paddle::lite::fluid::float16; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16; +}; template <> -struct FPTypeTraits { - typedef float T; +struct MLUTypeTraits { + using type = int8_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8; }; template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; +struct MLUTypeTraits { + using type = int32_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32; }; } // namespace mlu diff --git 
a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..ff8a7ddf6e4c465f288ba42b5b2537294a9d7ffd 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -41,6 +41,9 @@ class IoCopyHostToMluCompute auto mem_size = param.x->memory_size(); // LOG(INFO) << "copy size " << mem_size; auto* data = param.y->mutable_data(TARGET(kMLU), mem_size); + VLOG(6) << "io_copy host to mlu] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + param.y->set_precision(param.x->precision()); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -79,6 +82,13 @@ class IoCopyMluToHostCompute CHECK(param.x->target() == TARGET(kMLU)); auto mem_size = param.x->memory_size(); auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + VLOG(6) << "io_copy mlu to host] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + + // sync queue to ensure process done + auto& mlu_context = this->ctx_->template As(); + CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue())); + CopyToHostSync(data, param.x->raw_data(), mem_size); } @@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt32, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_kInt32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt8, + kNHWC, + 
paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_to_kInt8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt8), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc index d4e16734d6d2dae6f5c119194008bce114a2e918..42b12740ff0edb88ea2944e25ca03ade36caa956 100644 --- a/lite/kernels/mlu/layout_compute.cc +++ b/lite/kernels/mlu/layout_compute.cc @@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp32) .BindInput("Input", @@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp16) .BindInput("Input", @@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp32) .BindInput("Input", @@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp16) .BindInput("Input", @@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kInt8, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, - def_layout_nchw2nhwc_fp32_int8) + def_layout_nchw2nhwc_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8), diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h index edacdf8a98a2ffde6e538f61d4dd8259e3211b22..df254865994fe8548df0e021ecb471f5a1020080 100644 --- a/lite/kernels/mlu/layout_compute.h +++ b/lite/kernels/mlu/layout_compute.h @@ -22,6 +22,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/kernels/mlu/bridges/utility.h" #include "lite/operators/layout_op.h" namespace paddle { @@ -29,24 +30,6 @@ namespace lite { namespace kernels { namespace mlu { -template -struct FPTypeTraits {}; - -template <> -struct FPTypeTraits { - typedef float T; -}; - -template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; -}; - -template <> -struct FPTypeTraits { - typedef int8_t T; -}; - template inline void LayoutTransCompute(const int dim, const lite::Context& context, @@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim, template class LayoutNchwToNhwcCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); + auto x_ndims = param.x->dims().size(); auto& context = this->ctx_->template As(); const auto origin_dims = out->dims().Vectorize(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: axis = {0, 2, 1}; out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[1]}); break; case 4: axis = {0, 2, 3, 1}; out->Resize(std::vector{ - out->dims()[0], 
out->dims()[2], out->dims()[3], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]}); break; default: CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc"; } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, *x, out, axis); - if (x_dims > 2) { + if (x_ndims > 2) { out->Resize(origin_dims); } } @@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute template class LayoutNhwcToNchwCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); auto& context = this->ctx_->template As(); - const auto origin_dims = out->dims().Vectorize(); + TensorLite tmp_t; + tmp_t.ShareDataWith(*x); + const auto x_dims = x->dims().Vectorize(); + auto x_ndims = param.x->dims().size(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: - out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + tmp_t.Resize(std::vector{x_dims[0], x_dims[2], x_dims[1]}); axis = {0, 2, 1}; break; case 4: - out->Resize(std::vector{ - out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]}); + tmp_t.Resize( + std::vector{x_dims[0], x_dims[2], x_dims[3], x_dims[1]}); axis = {0, 3, 1, 2}; break; default: @@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); - - if (x_dims > 2) { - out->Resize(origin_dims); - } + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, tmp_t, out, axis); } std::string doc() const override { diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc index 73ca9dcc20a6311d33e5cff6c6ed6be08f3c7a1f..450031021d3ad70c6abb348a6e498d8876f5ec56 100644 --- a/lite/kernels/mlu/subgraph_compute.cc +++ b/lite/kernels/mlu/subgraph_compute.cc @@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_kFloat) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_FP16) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf..75570a6249ecaa36a94b73dafb27f655495cab87 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -14,17 +14,24 @@ #pragma once +#include +#include #include #include #include + #include "lite/api/paddle_place.h" #include "lite/core/kernel.h" +#include "lite/core/op_lite.h" #include "lite/core/op_registry.h" 
+#include "lite/core/tensor.h" #include "lite/core/type_system.h" #include "lite/core/types.h" #include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -36,125 +43,434 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc* block_desc, + const std::shared_ptr& program_desc, + Scope* exec_scope, const std::vector& input_names, const std::vector& output_names, - Scope* scope, - ::paddle::lite_api::PrecisionType type) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) { - graph_.SetFPType(type); + paddle::lite_api::PrecisionType type) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names), + fp_type_(type) { + VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is " + << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL"); + VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is " + << GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE", + true); + VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is " + << GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE", true)) { + disable_batch_size_changeable_ = true; + } } - int Build() { - // In order to attach all of the ops of the block desc, we need to build - // the original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + bool InputShapeChanged() { + std::vector> new_shape; + // used in batch changable situation + std::vector> all_shape; + for (auto origin_itensor : origin_itensors_) { + if (!disable_batch_size_changeable_) { + auto iv = origin_itensor->dims().Vectorize(); + all_shape.push_back(iv); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + } + inputs_shape_ = new_shape; + all_inputs_shape_ = all_shape; + if (shape_graph_map_.count(inputs_shape_) > 0) { + return false; + } + VLOG(3) << "MLU graph input shape changed" << std::endl; + return true; } - int Launch() { - // Rebuild device program when the shapes of input tensors have been - // changed. 
- if (subgraph::CHECK_SUCCESS(build_device_program_status_) && - subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( - build_device_program_status_) && - InputShapeChanged()) { - Build(); - } - if (subgraph::CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) { + switch (data_type) { + case paddle::lite_api::PrecisionType::kFP16: + return CNML_DATA_FLOAT16; + case paddle::lite_api::PrecisionType::kFloat: + return CNML_DATA_FLOAT32; + case paddle::lite_api::PrecisionType::kInt32: + return CNML_DATA_INT32; + case paddle::lite_api::PrecisionType::kInt8: + return CNML_DATA_UINT8; + default: + return PrecisionToDatatype(fp_type_); } - return 0; } protected: - int BuildDeviceProgram() override { + bool BuildDeviceProgram() override { + if (!origin_program_) { + BuildOriginProgram(); + } + if (!error_compile_batch_size_changeable_ && + !disable_batch_size_changeable_) { + int status = BuildDeviceProgramImpl(); + if (subgraph::CHECK_SUCCESS(status)) { + return status; + } + LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, " + "changed to input_shape changeable"; + } + error_compile_batch_size_changeable_ = true; + disable_batch_size_changeable_ = true; + return BuildDeviceProgramImpl(); + } + + bool BuildDeviceProgramImpl() { int status = 0; + auto graph = std::make_shared(); + graph->SetFPType(fp_type_); + std::vector> new_shape; + origin_itensors_.clear(); + origin_otensors_.clear(); + + auto* sub_block_desc = + program_desc_->GetBlock()(block_idx_); + auto data_order = sub_block_desc->GetOp(0)->Type() == "layout" + ? CNML_NCHW + : CNML_NHWC; // Convert all of input data vars and added into the MLU IR graph + status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; for (auto& input_name : input_names_) { - auto input_tensor = scope_->FindMutableTensor(input_name); + auto input_tensor = exec_scope_->FindMutableTensor(input_name); + auto data_type = input_tensor->precision(); + cnmlDataType_t fp_type = PrecisionToDatatype(data_type); + origin_itensors_.push_back(input_tensor); + if (!disable_batch_size_changeable_) { + auto iv = input_tensor->dims().Vectorize(); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(input_tensor->dims().Vectorize()); + } + CHECK(input_tensor); - auto input_node = - graph_.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph_.FPType(), - const_cast(input_tensor->raw_data())); + VLOG(4) << "subgraph input tensor " << input_name << std::endl; + auto input_node = graph->AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + fp_type, + data_order); CHECK(input_node); // MLU doesn't support dynamic dimensions/shapes, so need to rebuild // the program when the shape of any input tensor is changed. 
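// A minimal illustrative sketch of how the compiled-graph cache key is formed
// here (hypothetical free function; the real logic lives inline in
// InputShapeChanged() and BuildDeviceProgramImpl()): when batch-size-changeable
// mode is active the leading batch dimension is dropped, so inputs that differ
// only in batch size hit the same entry of shape_graph_map_.
#include <cstdint>
#include <vector>

std::vector<int64_t> MakeShapeKey(const std::vector<int64_t>& dims,
                                  bool batch_size_changeable) {
  std::vector<int64_t> key = dims;
  if (batch_size_changeable && !key.empty()) {
    key.erase(key.begin());  // drop the batch dimension
  }
  // e.g. {4, 3, 224, 224} and {8, 3, 224, 224} both map to {3, 224, 224}
  return key;
}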
- status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; } LOG(INFO) << "START TO CONVERT "; // Convert all of ops and its weights and added into the MLU IR graph const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = inst.op(); CHECK(op); std::string op_type = op->op_info()->Type(); + // since cnml's compile api will not return error now, we simply check + // op's type + if (!disable_batch_size_changeable_ && + std::find(unsupport_batch_size_changeable_op_type_.begin(), + unsupport_batch_size_changeable_op_type_.end(), + op_type) != + unsupport_batch_size_changeable_op_type_.end()) { + status |= subgraph::FAILED; + VLOG(4) << "[MLU] found unsupported batch_size changeable op type: " + << op_type; + if (subgraph::CHECK_FAILED(status)) { + return false; + } + return true; + } op->CheckShape(); const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kMLU))( - reinterpret_cast(&graph_), + reinterpret_cast(graph.get()), const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the MLU IR graph and build the graph to MLU // runtime - std::vector valid_output_names; for (auto& output_name : output_names_) { - if (graph_.HasNode(output_name)) { - graph_.AddOutput(graph_.GetNode(output_name)); - auto output_tensor = scope_->FindMutableTensor(output_name); - void* p_data = static_cast( - output_tensor->mutable_data::T>( - TARGET(kMLU))); - auto node = graph_.GetNode(output_name); - CHECK(p_data); - node->set_mlu_ptr(p_data); - valid_output_names.push_back(output_name); + if (graph->HasNode(output_name)) { + graph->AddOutput(graph->GetNode(output_name)); + auto output_tensor = exec_scope_->FindMutableTensor(output_name); + origin_otensors_.push_back(output_tensor); + VLOG(4) << "subgraph output tensor " << output_name << std::endl; + + // auto node = graph->GetNode(output_name); + // CHECK(p_data); + // node->set_mlu_ptr(p_data); } } for (auto& input_name : input_names_) { - graph_.AddInput(graph_.GetNode(input_name)); + graph->AddInput(graph->GetNode(input_name), + disable_batch_size_changeable_); } - CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; + + CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names"; auto& mlu_context = this->ctx_->template As(); auto core_version = mlu_context.MLUCoreVersion(); auto core_number = mlu_context.MLUCoreNumber(); - graph_.Compile(core_version, core_number); - return status; + graph->Compile(core_version, core_number); + shape_graph_map_[new_shape] = graph; + if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) { + graph->GenOfflineModel(GetOfflineModName()); + } + return true; + } + + std::string TrimStrings(const std::string& origin_str) { + std::string str = origin_str; + std::size_t found = str.find("0x"); + std::size_t found_end = 0; + const std::vector del_strs = { + "/trans_io_copy", "/trans_cast", "/trans_layout"}; + for (const auto& iterm : del_strs) { + found_end = str.find(iterm); + // trim point address and one of the del_strs + if (found != std::string::npos && found_end != std::string::npos) { + str.replace(found, found_end - found, ""); + found_end = str.find(iterm); + 
str.replace(found_end, iterm.size(), ""); + break; + } + } + return str; + } + + std::string GetOfflineModName() { + sort(input_names_.begin(), input_names_.end()); + sort(output_names_.begin(), output_names_.end()); + const auto& delimiter = "__"; + const auto& delimiter_num = "_"; + const auto& input_shape_str = "input_shape_"; + const auto& output_shape_str = "output_shape_"; + std::string name = ""; + std::string tmp = ""; + for (const auto& input_name : input_names_) { + tmp = input_name; + name += TrimStrings(tmp) + delimiter + input_shape_str; + auto input_tensor = exec_scope_->FindMutableTensor(input_name); + for (const auto& iterm : input_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + for (const auto& output_name : output_names_) { + tmp = output_name; + name += TrimStrings(tmp) + delimiter + output_shape_str; + auto output_tensor = exec_scope_->FindMutableTensor(output_name); + for (const auto& iterm : output_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + std::replace(name.begin(), name.end(), '/', '-'); + return name; } - int LaunchDeviceProgram() override { + void InferOutputsShapeOnly() { + // infer outputs shape when enable BATCH_SIZE_CHANGEABLE + const auto iter = in_out_shape_map_.find(all_inputs_shape_); + if (iter != in_out_shape_map_.end()) { + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(iter->second[i]); + } + } else { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + const_cast(op)->InferShape(); + } + std::vector> outs_shape; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + outs_shape.push_back(origin_otensors_[i]->dims().Vectorize()); + } + in_out_shape_map_[all_inputs_shape_] = outs_shape; + } + } + + inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) { + if (use_mlu_cast) { + // output is float, since cast fused in subgraph + return static_cast(tensor->mutable_data(TARGET(kMLU))); + } else { + return static_cast( + tensor->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>( + TARGET(kMLU))); + } + } + + bool LaunchDeviceProgram() override { + // prepare input and output memory auto& mlu_context = this->ctx_->template As(); auto exec_queue = mlu_context.exec_queue(); - u32_t affinity = mlu_context.affinity(); - cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; - graph_.Compute(forward_param, exec_queue); - return 0; + + auto graph = shape_graph_map_[inputs_shape_]; + auto* graph_input = graph->MutableInputs(); + auto* graph_output = graph->MutableOutputs(); + CHECK_EQ(graph_input->size(), origin_itensors_.size()); + CHECK_EQ(graph_output->size(), origin_otensors_.size()); + + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + + if (!disable_batch_size_changeable_) { + std::vector> + graph_in; + if (shape_tensor_map_in_.find(all_inputs_shape_) != + shape_tensor_map_in_.end()) { + graph_in = shape_tensor_map_in_[all_inputs_shape_]; + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_in[i]->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + } else { + graph_in.reserve(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + 
paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_itensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_input->at(i)->dtype()); + tmp.set_mlu_ptr(const_cast(origin_itensors_[i]->raw_data())); + graph_in.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_in_[all_inputs_shape_] = graph_in; + } + + // TODO(zhangmingwei): we just call every op's infer_shape to get outputs' + // shape, may be it's better to use cnml's api to get output shape. This + // can be done when cnml's tensor dimension is totally equal to lite's + // tensor + // shape. + InferOutputsShapeOnly(); + // const std::vector> new_output_size = + // graph->InferOutputsShape(graph_in); + + std::vector> + graph_out; + + if (shape_tensor_map_out_.find(all_inputs_shape_) != + shape_tensor_map_out_.end()) { + graph_out = shape_tensor_map_out_[all_inputs_shape_]; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + graph_out[i]->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + } else { + graph_out.reserve(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_otensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_output->at(i)->dtype()); + tmp.set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + graph_out.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_out_[all_inputs_shape_] = graph_out; + } + graph->Compute(exec_queue, graph_in, graph_out); + } else { + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_input->at(i)->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape()); + graph_output->at(i)->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + // only cnmlComputeFusionOpForward_V3 need cnrtInvokeFuncParam_t + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + u32_t affinity = mlu_context.affinity(); + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph->Compute(forward_param, exec_queue); + +#ifdef MLU_DUMP_SUBGRAPH_IO + // Graph node store compile-time tensor while batchsize mutable is set. 
+ // Only batchsize mutable is disabled, data exists in graph node at + // runtime + // =========== DUMP =================== + for (auto input_name : input_names_) { + auto input_tensor = + shape_graph_map_[inputs_shape_]->GetNode(input_name); + auto dump_name = input_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + input_tensor->ToFile(dump_name); + } + for (auto output_name : output_names_) { + if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) { + auto output_tensor = + shape_graph_map_[inputs_shape_]->GetNode(output_name); + auto dump_name = output_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + output_tensor->ToFile(dump_name); + } else { + VLOG(6) << "graph does not have " << output_name << " as output" + << std::endl; + } + } +#endif + // =========== DUMP END ================ + } + + return true; } - paddle::lite::subgraph::mlu::Graph graph_; + paddle::lite_api::PrecisionType fp_type_; + std::vector> inputs_shape_{}; + std::vector> all_inputs_shape_{}; + std::map>, + std::shared_ptr> + shape_graph_map_{}; + // enable batch size changeable by default, this cound be changed by + // environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and + // whether the op can be compiled with batch size changeable way + bool disable_batch_size_changeable_{false}; + bool error_compile_batch_size_changeable_{false}; + std::vector unsupport_batch_size_changeable_op_type_{"concat"}; + // search output runtime MLUTensor for certain output shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_out_{}; + // search input runtime MLUTensor for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_in_{}; + // search output shape for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, std::vector>> + in_out_shape_map_{}; }; template @@ -167,19 +483,18 @@ class SubgraphCompute auto& param = this->template Param(); // LOG(INFO) << "SUBGRAP Prepare RUN index " << param.sub_block_idx; engine_.reset(new SubgraphEngine(this->ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, param.output_data_names, - param.scope, this->precision())); CHECK(engine_); - engine_->Build(); } void Run() override { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } virtual ~SubgraphCompute() = default; diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 5157f47867160cf4f705306ca37cfad962373386..be30d1c03988cb8b88761c0719c2785446c0b0ea 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU AND NOT LITE_WITH_HUAWEI_ASCEND_NPU) return() endif() diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index db9a652b6c1b4055e09a70e1f407b1027fd1b1e8..afe689729d3efde0a611c6da2086e0f8cf58a307 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ 
-100,6 +100,9 @@ int ActConverter(void* ctx, auto offset = op_info->GetAttr("offset"); act_op->set_attr_negative_slope(slope); act_op->set_attr_coef(offset); + } else if (op_type == "thresholded_relu") { + auto threshold = op_info->GetAttr("threshold"); + act_op->set_attr_coef(threshold); } return SUCCESS; } @@ -141,6 +144,10 @@ REGISTER_SUBGRAPH_BRIDGE( hard_sigmoid, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + thresholded_relu, + kNPU, + paddle::lite::subgraph::npu::ActConverter); REGISTER_SUBGRAPH_BRIDGE( log, kNPU, paddle::lite::subgraph::npu::ActConverter); diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 8ca8357710e1f36a7c3f21417d7633e47f18c59a..b9f81a74ad997966ecb79c66bceed1e84b4a91f7 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -15,6 +15,7 @@ #include "lite/kernels/npu/bridges/engine.h" #include #include +#include #include #include "lite/kernels/npu/bridges/registry.h" @@ -22,104 +23,90 @@ namespace paddle { namespace lite { namespace subgraph { -int Engine::BuildDeviceProgram() { return FAILED; } +Engine::Engine(KernelContext *ctx, + int block_idx, + const std::shared_ptr &program_desc, + Scope *exec_scope, + const std::vector &input_names, + const std::vector &output_names) + : ctx_(ctx), + block_idx_(block_idx), + program_desc_(program_desc), + exec_scope_(exec_scope) { + input_names_ = input_names; + output_names_ = output_names; + // Sort the name of input and output tensors, it's convenient for us to get + // the info of input and output tensors in the same order from the device + // program, because the result of subgraph division may be different but right + // at each call of the subgraph pass. + std::stable_sort(input_names_.begin(), input_names_.end()); + std::stable_sort(output_names_.begin(), output_names_.end()); +} + +bool Engine::Run() { + if (is_first_epoch_) { + PrepareWorkspaceForDeviceProgram(); + is_first_epoch_ = false; + } + if (InputShapeChanged()) { + BuildDeviceProgram(); + } + return LaunchDeviceProgram(); +} -int Engine::LaunchDeviceProgram() { return 0; } +bool Engine::PrepareWorkspaceForOriginProgram() { + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = exec_scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + } + origin_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = exec_scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + } + return true; +} -int Engine::BuildOriginProgram() { +bool Engine::BuildOriginProgram() { // TODO(hong19860320) The block_desc need to be divided into subgraphs during // the exection time. But only see them as a subgraph now. 
- origin_program_.clear(); - for (size_t op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) { - auto op_desc = block_desc_->GetOp(op_idx); - CHECK(op_desc); - std::string op_type = op_desc->Type(); - auto op = LiteOpRegistry::Global().Create(op_desc->Type()); - op->Attach(*op_desc, scope_); - std::unique_ptr picked_kernel; - if (op_desc->HasAttr(kKernelTypeAttr)) { - // Create op and pick up kernel according to the kKernelTypeAttr attribute - auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); - std::string alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type - << " for " << op_type; - auto kernels = op->CreateKernels({place}); - CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - picked_kernel = std::move(*it); - } else { - VLOG(3) << "The attr '" << kKernelTypeAttr - << "' not found, pick the first kernel for " << op_type; - std::vector> kernels; -#if defined(LITE_WITH_ARM) - kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); -#elif defined(LITE_WITH_X86) - kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); -#endif - if (kernels.size() > 0) { - picked_kernel = std::move(kernels.front()); - } else { - LOG(WARNING) << "No kernels found for " << op_type; - } - } - if (picked_kernel != nullptr) { - picked_kernel->SetContext( - ContextScheduler::Global().NewContext(picked_kernel->target())); - } - origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); + if (!origin_program_) { + origin_program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, block_idx_)); } - return 0; + return true; } -int Engine::LaunchOriginProgram() { - for (auto& inst : origin_program_) { - auto op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - inst.Run(); +bool Engine::LaunchOriginProgram() { + if (!origin_program_) { + BuildOriginProgram(); } - return 0; + if (origin_program_) { + VLOG(3) << "Roll back to run the origin program."; + origin_program_->Run(); + return true; + } + return false; } -int Engine::Build() { - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; +bool Engine::PrepareWorkspaceForDeviceProgram() { + return PrepareWorkspaceForOriginProgram(); } -void Engine::InitDeviceTensor() { return; } +bool Engine::BuildDeviceProgram() { return BuildOriginProgram(); } + +bool Engine::LaunchDeviceProgram() { return LaunchOriginProgram(); } bool Engine::InputShapeChanged() { + bool changed = false; for (size_t i = 0; i < origin_itensors_.size(); i++) { - if (origin_itensors_[i]->dims() != origin_idims_[i]) { - return true; - } - } - return false; -} - -int Engine::Launch() { - // Rebuild device program when the shapes of input tensors have been changed. 
- if (CHECK_SUCCESS(build_device_program_status_) && - CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && - InputShapeChanged()) { - Build(); - InitDeviceTensor(); - } - if (CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + auto origin_idim = origin_itensors_[i]->dims().Vectorize(); + changed |= origin_idim != origin_idims_[i]; + origin_idims_[i] = origin_idim; } - return 0; + return changed; } } // namespace subgraph diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index 6a3f72077a9bed7a296b184330af119262472ada..daa02fb0d7bf8f70ebf8b21821a274b6a0ba062d 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -30,52 +30,39 @@ class Engine { public: Engine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - lite::Scope *scope, - std::string model_cache_dir = "") - : ctx_(ctx), - block_idx_(block_idx), - block_desc_(block_desc), - input_names_(input_names), - output_names_(output_names), - scope_(scope), - model_cache_dir_(model_cache_dir) {} + const std::vector &output_names); virtual ~Engine() = default; - virtual int Build(); - virtual int Launch(); + virtual bool Run(); private: Engine(const Engine &) = delete; protected: - virtual int BuildDeviceProgram(); - virtual int LaunchDeviceProgram(); + virtual bool PrepareWorkspaceForOriginProgram(); + virtual bool BuildOriginProgram(); + virtual bool LaunchOriginProgram(); - virtual int BuildOriginProgram(); - virtual int LaunchOriginProgram(); + virtual bool PrepareWorkspaceForDeviceProgram(); + virtual bool BuildDeviceProgram(); + virtual bool LaunchDeviceProgram(); - virtual void InitDeviceTensor(); virtual bool InputShapeChanged(); KernelContext *ctx_{nullptr}; - int block_idx_; - cpp::BlockDesc *block_desc_; + int block_idx_{-1}; + const std::shared_ptr program_desc_{nullptr}; std::vector input_names_; std::vector output_names_; - Scope *scope_{nullptr}; - // SUCCESS: device program build successed. FAILED: device program build - // failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need - // to rebuild when input shape changed. 
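// A minimal illustrative sketch of how a backend plugs into the reworked
// Engine (hypothetical MyEngine with made-up helpers CompileToDevice() and
// RunOnDevice(); only the bool-returning hooks shown as overrides exist in
// this patch): Run() calls PrepareWorkspaceForDeviceProgram() on the first
// epoch, BuildDeviceProgram() whenever InputShapeChanged() reports a change,
// and LaunchDeviceProgram() on every invocation; rolling back to the origin
// program is done explicitly by the backend, as the NPU/MLU engines do.
#include "lite/kernels/npu/bridges/engine.h"

class MyEngine : public paddle::lite::subgraph::Engine {
 public:
  using paddle::lite::subgraph::Engine::Engine;  // inherit the constructor

 protected:
  bool BuildDeviceProgram() override {
    if (!origin_program_) BuildOriginProgram();
    device_ready_ = CompileToDevice();  // hypothetical backend compile step
    return device_ready_;
  }
  bool LaunchDeviceProgram() override {
    if (!device_ready_) return LaunchOriginProgram();  // roll back to CPU
    return RunOnDevice();  // hypothetical backend execution step
  }

 private:
  bool CompileToDevice() { return false; }  // placeholder
  bool RunOnDevice() { return true; }       // placeholder
  bool device_ready_{false};
};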
- int build_device_program_status_{0}; - std::vector origin_idims_; - std::vector origin_odims_; + Scope *exec_scope_{nullptr}; + bool is_first_epoch_{true}; + std::vector> origin_idims_; std::vector origin_itensors_; std::vector origin_otensors_; - std::vector origin_program_; - std::string model_cache_dir_{""}; + std::unique_ptr origin_program_{nullptr}; }; } // namespace subgraph diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h index 38b03e06fa212728888cf47b3048d71fd4de06fc..1bc588496a253aa82183e020adc39989ad8d7312 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "graph/op/all_ops.h" +#include "graph/compatible/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc index 32af1916899454ef7a045339da5e9fc8a6131cfc..79ba82d94f24f61c2b9f51bd29634151bfcfa0ab 100644 --- a/lite/kernels/npu/bridges/matmul_op.cc +++ b/lite/kernels/npu/bridges/matmul_op.cc @@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } else { matmul_node = graph->Add(out_name); auto matmul_op = matmul_node->data(); - matmul_op->set_input_x(*x_node->data()); - matmul_op->set_input_y(*y_node->data()); - matmul_op->set_attr_adj_x(transpose_x); - matmul_op->set_attr_adj_y(transpose_y); + matmul_op->set_input_x1(*x_node->data()); + matmul_op->set_input_x2(*y_node->data()); + matmul_op->set_attr_adj_x1(transpose_x); + matmul_op->set_attr_adj_x2(transpose_y); } if (fabs(alpha - 1.f) > 1e-6f) { diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h index b6ce66fe34963d8c3bc9c2bccc0f3a294ab16290..d431133bfd2361c3ffd80d54c445d13e382492f5 100644 --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -25,6 +25,7 @@ USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU); USE_SUBGRAPH_BRIDGE(log, kNPU); USE_SUBGRAPH_BRIDGE(sqrt, kNPU); USE_SUBGRAPH_BRIDGE(square, kNPU); +USE_SUBGRAPH_BRIDGE(thresholded_relu, kNPU); USE_SUBGRAPH_BRIDGE(batch_norm, kNPU); USE_SUBGRAPH_BRIDGE(less_than, kNPU); diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc index d9c9ffae923631d20c462149a57fccf3335836fd..abc24ea2ca8fea35007687218db963181e304156 100644 --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -144,6 +144,8 @@ int CvtActMode(std::string act_type) { act_mode = 9; } else if (act_type == "hard_sigmoid") { act_mode = 10; + } else if (act_type == "thresholded_relu") { + act_mode = 11; } else { // TODO(hong19860320) support more activation mode LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h index 107d90c116b8239a9060f252c45c2b2d7901ddf7..6e75e58187909ad59da37dbcb0737a92ec014e22 100644 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -20,11 +20,11 @@ #include #include #include "graph/buffer.h" +#include "graph/compatible/operator_reg.h" #include "graph/graph.h" #include "graph/model.h" #include "graph/op/all_ops.h" #include "graph/operator.h" -#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -97,25 +97,26 @@ REG_OP(Pad) /* * Multiplies slices of two tensors in batches. 
* - * x : The input tensor - * y : The input tensor + * x1 : The input tensor + * x2 : The input tensor * - * z : The output tensor + * y : The output tensor * - * adj_x : adj_x is true, the input tensor x is transposed, otherwise - * it will not be transposed. Default is false (The current version only - * supports false). - * adj_y : adj_y is true, the input tensor y is transposed, otherwise - * it will not be transposed. Default is false. + * adj_x1 : adj_x1 is true, the input tensor x1 is transposed, + * otherwise it will not be transposed. + * Default is false (The current version only supports false). + * adj_x2 : adj_x2 is true, the input tensor x2 is transposed, + * otherwise it will not be transposed. + * Default is false. * - * 100.320.010.010 + * 100.320.010.010 */ REG_OP(BatchMatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(y, TensorType({DT_FLOAT})) - .OUTPUT(z, TensorType({DT_FLOAT})) - .ATTR(adj_x, AttrValue::BOOL{false}) - .ATTR(adj_y, AttrValue::BOOL{false}) + .INPUT(x1, TensorType({DT_FLOAT})) + .INPUT(x2, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(adj_x1, AttrValue::BOOL{false}) + .ATTR(adj_x2, AttrValue::BOOL{false}) .OP_END() } // namespace ge diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index f17d73f8dfd540c8a1b809d780084b05299ccc2f..e9c5957ff6d8f026f712de04f4e32cd69baf50a9 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" @@ -24,205 +25,283 @@ #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #include "lite/kernels/npu/bridges/utility.h" #include "lite/utils/io.h" +#include "lite/utils/md5.h" namespace paddle { namespace lite { namespace kernels { namespace npu { -std::string SubgraphEngine::GenerateModelCacheName() const { - auto inames = device_inames_; - auto onames = device_onames_; - std::stable_sort(inames.begin(), inames.end()); - - std::string model_cache_name = "subgraph_" + std::to_string(block_idx_); - for (auto iname : inames) { - model_cache_name += "_"; - auto itensor = scope_->FindTensor(iname); - int tmp = 0; - for (auto i : itensor->dims().Vectorize()) { - tmp += i * i; +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. 
the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (int i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; } - model_cache_name += std::to_string(tmp % 1999); } - model_cache_name += "_.om"; + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} - return model_cache_name; +// Deserialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op from the cached configuration file and HiAI +// om file +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Load model from " << model_path; + std::vector model_buffer; + if (!ReadFile(model_path, &model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!"; + return false; + } + bool model_comp = false; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; + } + // Rewrite with the compatible model data if the cached + // model file is incompatible with the current device + if (!model_comp) { + VLOG(3) << "[NPU] Export the compatible model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + } + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[NPU] read from " << config_path << " failed!"; + return false; + } + std::string str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; } -int SubgraphEngine::BuildDeviceProgram() { +bool DeviceProgram::BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + 
// then added them into the HiAI IR graph int status = 0; - // Convert all of ops and their input vars and weights and added into the NPU - // HiAI IR graph subgraph::npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + CHECK(origin_program) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[NPU] No instructions found in the origin program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Collect the valid input and output nodes in the HiAI IR graph and update - // the input and output names - device_inames_.clear(); - device_onames_.clear(); + // Collect the input and output nodes of the HiAI IR graph std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } std::vector device_onodes; - for (auto& input_name : input_names_) { - if (graph.Has(input_name)) { - if (graph.Get(input_name)->is_data()) { - device_inodes.push_back(*graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it is not a data node."; - } - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it does not exist."; - } + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); } - for (auto& output_name : output_names_) { - if (graph.Has(output_name)) { - device_onodes.push_back(*graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[NPU] Output node " << output_name - << " is ignored because it does not exist."; - } + // Build the HiAI IR graph to the HiAI om model + std::vector model_buffer; + if (!lite::npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[NPU] Build model failed!"; + return false; } - CHECK(!device_inames_.empty()) - << "[NPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[NPU] No output nodes found for building NPU model"; - - // Build the HiAI IR graph to HiAI om model as the device program - if (device_program_map_.count(inputs_shape_) > 0) { - return status; + // Load the HiAI om model and create a HiAI model manager client(from HiAI + // Service) to run inference. + bool model_comp = true; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; } - std::string model_cache_full_dir = - model_cache_dir_.empty() ? 
"" : model_cache_dir_ + "/" + - GenerateModelCacheName(); - auto device_client = lite::npu::Device::Global().Build( - model_name_, device_inodes, device_onodes, model_cache_full_dir); - if (device_client == nullptr) { - LOG(WARNING) << "[NPU] Build model failed!"; - return subgraph::FAILED; + // Do not check model compatibility because it assume that the cached om model + // is always compatible with the current device + // Update the precison and dimensions of the origin output tensors + // Update the precison and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); } - auto device_program = std::make_shared(device_client); - if (!inputs_shape_.empty()) { - device_program_map_[inputs_shape_] = device_program; + if (!model_cache_dir.empty()) { + // Save the generated model to file, used for the model caching or the + // offline model generation + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Save model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (int i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Save configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[NPU] Open " << config_path << " for writting failed!"; + } } + return true; +} - // Query and check the dimensions of valid input and output tensors - std::vector device_idims, device_odims; - if (device_program->client->GetModelIOTensorDim( - model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { - LOG(WARNING) - << "[NPU] Get the dimensions of input and output tensors failed!"; - return subgraph::FAILED; +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + if (device_idims_.empty() || device_odims_.empty()) { + if (model_client_->GetModelIOTensorDim( + model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) { + LOG(WARNING) + << "[NPU] Get the dimensions of input and output tensors failed!"; + return false; + } } - device_program->device_idims = device_idims; - device_program->device_odims = device_odims; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), 
output_names.size()); + for (int i = 0; i < input_names.size(); i++) { + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + VLOG(3) << "[NPU] Init the input tensors for the device program and share " + "their buffers with the origin input tensors"; + // Reinit device tensor will free shared buffer, so copy data to a tmp + // tensor + Tensor tmp; + tmp.CopyDataFrom(*(*origin_itensors)[i]); + (*device_itensors)[i]->Init(&(device_idims_[i])); - CHECK_EQ(device_idims.size(), device_inames_.size()); - CHECK_EQ(device_odims.size(), device_onames_.size()); - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); + std::memcpy( + (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_idims[i].GetNumber() << "," - << device_idims[i].GetChannel() << "," - << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() - << "}"; - // Prepare the device input tensors - CHECK_EQ(origin_idims_[i].production(), - device_idims[i].GetNumber() * device_idims[i].GetChannel() * - device_idims[i].GetHeight() * device_idims[i].GetWidth()); - device_itensors_[i].reset(new hiai::AiTensor); - device_itensors_[i]->Init(&(device_idims[i])); + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_itensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); } - device_program->origin_idims = origin_idims_; - - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_odims[i].GetNumber() << "," - << device_odims[i].GetChannel() << "," - << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() - << "}"; - // Prepare the device output tensors - switch (precision) { - case PRECISION(kFloat): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kBool): - origin_otensors_[i]->mutable_data(); - break; - case 
PRECISION(kInt8): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt16): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt32): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt64): - origin_otensors_[i]->mutable_data(); - break; - default: - LOG(FATAL) << "[NPU] " << device_onames_[i] - << " can't mutable data with precision type " - << PrecisionToStr(precision); - break; - } - device_program->origin_odims = origin_odims_; - - CHECK_EQ(origin_odims_[i].production(), - device_odims[i].GetNumber() * device_odims[i].GetChannel() * - device_odims[i].GetHeight() * device_odims[i].GetWidth()); - device_otensors_[i].reset(new hiai::AiTensor); - device_otensors_[i]->Init(&(device_odims[i])); + for (int i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + (*device_otensors)[i]->Init(&(device_odims_[i])); + VLOG(3) << "[NPU] Init the output tensors for the device program and share " + "their buffers with the origin output tensors"; + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_otensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { - // Copy the data of origin input tensors to the buffer of input HiAI tensors - // init device_itensors_, device_otensors_, origin_otensors_ - auto device_program = device_program_map_[inputs_shape_]; - +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name hiai::AiContext model_context; @@ -234,88 +313,106 @@ int SubgraphEngine::LaunchDeviceProgram() { }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ(device_program->client->Process( - model_context, device_itensors_, device_otensors_, 1000, istamp), + CHECK_EQ(model_client_->Process( + model_context, *device_itensors, *device_otensors, 1000, istamp), hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - return 0; + return true; } -int SubgraphEngine::Build() { - if (device_program_map_.count(inputs_shape_) > 0) { - return subgraph::SUCCESS; +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + 
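// Note: the AiTensor is deliberately left unsized at this point; its
// dimensions are only known after the om model is loaded, so
// DeviceProgram::ShareBufferWithOriginTensors() later calls Init() on it with
// the dims queried from the model client and then rebinds the origin tensor
// to the device buffer.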
CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); } - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + return true; } -void SubgraphEngine::InitDeviceTensor() { - auto device_program = device_program_map_[inputs_shape_]; - for (size_t i = 0; i < device_itensors_.size(); i++) { - if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) { - VLOG(3) << "init device_itensors and share input tensor buf between " - "device and host"; - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); - // share data buf between device_itensor and origin_itensor - std::shared_ptr buffer = - std::make_shared(device_itensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_itensors_[i]->GetSize()); - origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(exec_scope_); + VLOG(3) << "[NPU] Getting subgraph_model_cache_dir: " << model_cache_dir; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile( + input_names_, output_names_, origin_idims_, model_cache_dir)) { + // Build the model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. 
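// For reference, the cache consumed by LoadFromCacheFile() and produced by
// BuildGraphAndCacheToFile() consists of two files per subgraph, both named
// by the MD5 model key:
//   <model_cache_dir>/<model_name_>.om   - the compiled HiAI om model
//   <model_cache_dir>/<model_name_>.cfg  - origin output metadata, one
//     "<precision>:<d0>,<d1>,...,;" record per output, where <precision> is
//     the integer value of the output's PrecisionType (each dim is followed
//     by ',' and each record ends with ';').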
+ if (!origin_program_) { + BuildOriginProgram(); + } + CHECK(origin_program_) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir)) { + return false; + } } - } - for (size_t i = 0; i < device_otensors_.size(); i++) { - if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { - VLOG(3) << "init device_otensors and share output tensor buf between " - "device and host"; - device_otensors_[i]->Init(&(device_program->device_odims[i])); - // share data buf between device_itensor and origin_itensor - origin_otensors_[i]->Resize(device_program->origin_odims[i]); - std::shared_ptr buffer = - std::make_shared(device_otensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_otensors_[i]->GetSize()); - origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + if (device_program->model_client_ == nullptr) { + return false; } + device_programs_[origin_idims_] = device_program; } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); } -bool SubgraphEngine::InputShapeChanged() { - std::vector> new_shape; - for (auto origin_itensor : origin_itensors_) { - new_shape.push_back(origin_itensor->dims().Vectorize()); +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. 
+ if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); } - if (inputs_shape_ == new_shape) { - return false; + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); } - inputs_shape_ = new_shape; - return true; + return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_); } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope, - NPUContext::SubgraphModelCacheDir())); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace npu diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 9f0b5a944137dbf9a521235b80398feca1cd82b0..2203acaee82704b2a9e93d8b14d708197d7afb1a 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -28,52 +28,69 @@ namespace lite { namespace kernels { namespace npu { +class DeviceProgram { + public: + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir); + bool BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors); + + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; + class SubgraphEngine : public subgraph::Engine { public: - SubgraphEngine(KernelContext *ctx, + SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc *block_desc, - const std::vector &input_names, - const std::vector &output_names, - Scope *scope, - std::string model_cache_dir = "") + const std::shared_ptr& program_desc, + Scope* exec_scope, + const std::vector& input_names, + const std::vector& output_names) : subgraph::Engine(ctx, block_idx, - block_desc, + program_desc, + exec_scope, input_names, - output_names, - scope, - model_cache_dir) {} - - struct device_program_t { - explicit device_program_t(std::shared_ptr _client) - : client(_client) {} - std::shared_ptr client{nullptr}; - std::vector origin_idims{}; - std::vector origin_odims{}; - std::vector device_idims{}; - std::vector device_odims{}; - }; - - int Build() override; + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; - - void InitDeviceTensor() override; - bool InputShapeChanged() override; - - std::string GenerateModelCacheName() const; + 
bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; - std::string model_name_{"model.om"}; - std::vector> inputs_shape_{}; - std::map>, std::shared_ptr> - device_program_map_{}; - std::vector device_inames_{}; - std::vector device_onames_{}; std::vector> device_itensors_{}; std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 600d0d22553af9d857d03491aabd2067db8f32ef..81e1a4d7562a9decab2e2daf4001faec7ac2fcee 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -21,6 +21,7 @@ add_kernel(fusion_elementwise_sub_activation_opencl add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) @@ -67,6 +68,9 @@ lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc DEPS reshape_opencl op_registry program context) +lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc + DEPS transpose_opencl layout_opencl op_registry program context) + lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc DEPS concat_opencl layout_opencl op_registry program context) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index fed8171cc273b437be411225363bf4a732769ae3..083f72134eba8afc7db696f68d64098b9c59a0f9 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -28,91 +28,83 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { -/* image kernel*/ + void ConvImageCompute::PrepareForRun() { - const auto& param = this->Param(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); + ReInitWhenNeeded(); + + auto filter_dims = conv_param_->filter->dims(); + filter_tensor_n_ = filter_dims[0]; + filter_tensor_c_ = filter_dims[1]; + filter_tensor_h_ = filter_dims[2]; + filter_tensor_w_ = filter_dims[3]; - float* filter_cpu = param.filter->mutable_data(); auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const bool is_mali = context.cl_context()->IsArmMali(); - filter_gpu_image_ = std::unique_ptr(new Tensor); - tensor_hold_filter_image_ = std::unique_ptr(new Tensor); - tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - int bs = x_dims[0]; - int c_in = x_dims[1]; - int h_out = output_dims[2]; - int w_out = output_dims[3]; - int kernel_h = filter_dims[2]; // oihw - int kernel_w = filter_dims[3]; - auto paddings = *param.paddings; - auto dilations = *param.dilations; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - int groups = param.groups; - bool relu_fused = 
param.fuse_relu; - bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); - bool zero_pad = (pad_h == 0) && (pad_w == 0); - - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && - (paddings[2] == paddings[3])); - bool stride_equal = stride_h == stride_w; - bool dilation_equal = dilations[0] == dilations[1]; + + auto paddings = *conv_param_->paddings; + pad_up_ = paddings[0]; + pad_down_ = paddings[1]; + pad_left_ = paddings[2]; + pad_right_ = paddings[3]; + + auto dilations = *conv_param_->dilations; + dilation_h_ = dilations[0]; + dilation_w_ = dilations[1]; + + stride_h_ = conv_param_->strides[0]; + stride_w_ = conv_param_->strides[1]; + + groups_ = conv_param_->groups; + relu_fused_ = conv_param_->fuse_relu; + has_bias_ = (conv_param_->bias) != nullptr; + offset_ = filter_tensor_h_ / 2 - pad_up_; + + bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_left_) && + (pad_left_ == pad_right_)); + bool stride_equal = stride_h_ == stride_w_; + bool dilation_equal = dilation_h_ == dilation_w_; VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); - VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); - VLOG(3) << "groups:" << groups << " stride_h:" << stride_h - << " stride_w:" << stride_w << " pad_h:" << pad_h - << " pad_w:" << pad_w << " kernel_h:" << kernel_h - << " kernel_h:" << kernel_h; - VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1]; - VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " - << output_dims[2] << " " << output_dims[3]; - VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " - << filter_dims[2] << " " << filter_dims[3]; + VLOG(3) << "Is relu fused? / " << (relu_fused_ ? "Yes" : "No"); + VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_ + << " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_ + << " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_ + << " filter_tensor_h_:" << filter_tensor_h_; + VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_ + << " " << input_tensor_h_ << " " << input_tensor_w_; + VLOG(3) << "dialtion:" << dilation_h_ << " " << dilation_w_; + VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_ + << " " << output_tensor_h_ << " " << output_tensor_w_; + VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_ + << " " << filter_tensor_h_ << " " << filter_tensor_w_; VLOG(3) << "pad_equal:" << pad_equal; VLOG(3) << "stride_equal:" << stride_equal; VLOG(3) << "dilation_equal:" << dilation_equal; - VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " - << paddings[2] << " " << paddings[3]; + VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_ + << " " << pad_right_; CHECK(pad_equal && stride_equal && dilation_equal); + CHECK_GE(conv_param_->dilations->size(), 2); + CHECK(dilation_h_ == dilation_w_); + CHECK_GE(conv_param_->paddings->size(), 2); + CHECK(pad_left_ == pad_up_); + CHECK_GE(conv_param_->strides.size(), 2); + CHECK(stride_h_ == stride_w_); + + if (!is_mali) { + use_tune_ = false; + } - // general gws.. 
- auto out_image_shape = InitImageDimInfoWith(output_dims); - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - default_c_blk_ = default_work_size[0]; - default_w_blk_ = default_work_size[1]; - default_nh_blk_ = default_work_size[2]; - c_blk_ = default_c_blk_; - w_blk_ = default_w_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - - if (kernel_h == 1 && kernel_w == 1) { - // conv2d_1x1 - // if (param.x->dims()[1] % 4 == 0) { - // kernel_func_names_.push_back("conv2d_1x1_simple"); - // } else { - // kernel_func_names_.push_back("conv2d_1x1_opt"); - // } + /********************************************* + * Upload filter, bias to opencl device + *********************************************/ + float* filter_cpu = conv_param_->filter->mutable_data(); + filter_gpu_image_ = std::unique_ptr(new Tensor); + tensor_hold_filter_image_ = std::unique_ptr(new Tensor); + tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - if (param.x->dims()[1] % 4 == 0) { + if (filter_tensor_h_ == 1 && filter_tensor_h_ == 1) { + if (input_tensor_c_ % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { kernel_func_names_.push_back("conv2d_1x1_opt"); @@ -121,89 +113,49 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - // std::vector filter_image_v(filter_image_dims[0] * - // filter_image_dims[1] * 4); // 4 : - // RGBA - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); - + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d1x1opt; - { - // calc 1x1 gws - w_blk_ = maptofactor(default_w_blk_, 4); - c_blk_ = default_c_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && - kernel_h == 3 && kernel_w == 3 && groups > 1) { + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ && + filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) { // depth_conv2d_3x3s1, depth_conv2d_3x3 - if (stride_h == 1 && dilations[0] == 1) { + if (stride_h_ == 1 && dilation_h_ == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; - { - // depthwise spl gws s1 - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - c_blk_ = c_block; - w_blk_ = w_blk; - nh_blk_ = nh; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; - { - // depthwise spl gws - int c_block = 
(output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - c_blk_ = c_block; - w_blk_ = w; - nh_blk_ = nh; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); #endif - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ #ifdef DEPTH_CONV_USE_SPL && - kernel_h != 3 + filter_tensor_h_ != 3 #endif #undef DEPTH_CONV_USE_SPL ) { @@ -213,75 +165,61 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_w == 3 && kernel_h == 3) { + } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { // #define CONV3x3OPT_FALL_BACK #ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3opt; - - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #else kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3; - #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { + } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT // conv2d_5x5 @@ -290,55 +228,42 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5; #else // conv2d_5x5_opt - kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_5x5_OPT - } else if (kernel_h == 7 && kernel_w == 7) { + } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) { #define CONV_7x7_OPT #ifndef CONV_7x7_OPT // conv2d_7x7 @@ -347,52 +272,39 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7; #else // conv2d_7x7 - kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" - : "conv2d_7x7_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_7x7_OPT } else { @@ -404,30 +316,30 @@ void ConvImageCompute::PrepareForRun() { // build options std::string build_options_single(" -DCL_DTYPE_half"); // relu options - VLOG(3) << "relu_fused:" << relu_fused - << " param.activation_param.active_type:" - << static_cast(param.activation_param.active_type) - << " param.activation_param.has_active:" - << param.activation_param.has_active; - if (param.activation_param.has_active) { - if (param.activation_param.active_type == - lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused` + VLOG(3) << "relu_fused_:" << relu_fused_ + << " conv_param_->activation_param.active_type:" + << static_cast(conv_param_->activation_param.active_type) + << " conv_param_->activation_param.has_active:" + << conv_param_->activation_param.has_active; + if (conv_param_->activation_param.has_active) { + if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused_` // also is ok build_options_single += " -DRELU"; - } else if (param.activation_param.active_type == + } else if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu6) { build_options_single += " -DRELU6"; } else { LOG(FATAL) << "Unsupported activation type:" - << static_cast(param.activation_param.active_type); + << static_cast(conv_param_->activation_param.active_type); } } + GetGlobalWorkSize(); // bias options - const bool has_bias = param.bias != nullptr; const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); + if (has_bias_) { bias_gpu_image_ = std::unique_ptr(new Tensor); build_options_single += is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; @@ -435,21 +347,36 @@ void ConvImageCompute::PrepareForRun() { // convert cpu buffer bias --> gpu image CLImageConverterFolder bias_converter; const DDim& bias_image_dims = - bias_converter.InitImageDimInfoWith(param.bias->dims()); - + bias_converter.InitImageDimInfoWith(conv_param_->bias->dims()); + bias_image_h_ = bias_image_dims[1]; + bias_image_w_ = bias_image_dims[0]; tensor_hold_bias_image_->Resize( {1, bias_image_dims[0], bias_image_dims[1], 4}); half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); - float* bias_cpu_data = param.bias->mutable_data(); + float* bias_cpu_data = conv_param_->bias->mutable_data(); bias_converter.NCHWToImage( - bias_cpu_data, bias_image_data, param.bias->dims()); + bias_cpu_data, bias_image_data, conv_param_->bias->dims()); this->bias_gpu_image_->mutable_data( bias_image_dims[0], bias_image_dims[1], bias_image_data); // convert cpu buffer bias --> gpu image --- end ---- + } else { + bias_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder bias_converter; + tensor_hold_bias_image_->Resize({1, 1, 1, 4}); + half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); + this->bias_gpu_image_->mutable_data( + 1, 1, bias_image_data); } + // define image pointer for filter, bias + input_image_p_ = conv_param_->x->data(); + filter_image_p_ = filter_gpu_image_->data(); + bias_image_p_ = bias_gpu_image_->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -475,55 +402,55 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; if (max_work_group_size > 0 && use_lws_) { - double min_turn_time = DBL_MAX; + double min_tune_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; - if (use_turn_) { + if (use_tune_) { for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; } // reverse for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( 
global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; @@ -537,548 +464,316 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::ReInitWhenNeeded() { + conv_param_ = param_.get_mutable(); + auto x_dims = conv_param_->x->dims(); #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_ + << ", last_input_dims_:" << last_input_dims_ + << ", x_dims:" << x_dims; #endif -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -// VLOG(4) << "default work size{c_block, w, nh}: " -// << "{" << c_block << ", " << w << ", " << nh << "" -// << 
"}"; -#endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, default_w_blk_); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { - CLRuntime::Global()->command_queue().finish(); - } -} -void ConvImageCompute::Conv2d3x3(bool is_turn) { - auto kernel = kernel_; - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - int filter_channel = filter_dims[1]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - // re-calc group - int new_groups{param.groups}; - if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) { 
- new_groups = 1; - } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { - new_groups = input_channel / filter_channel; - } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; + if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) { + is_first_epoch_for_run_ = false; + last_input_dims_ = x_dims; + + input_tensor_n_ = x_dims[0]; + input_tensor_c_ = x_dims[1]; + input_tensor_h_ = x_dims[2]; + input_tensor_w_ = x_dims[3]; + auto x_image_shape = InitImageDimInfoWith(x_dims); + input_image_h_ = x_image_shape["height"]; + input_image_w_ = x_image_shape["width"]; + + auto output_dims = conv_param_->output->dims(); + output_tensor_n_ = output_dims[0]; + output_tensor_c_ = output_dims[1]; + output_tensor_h_ = output_dims[2]; + output_tensor_w_ = output_dims[3]; + auto output_image_shape = InitImageDimInfoWith(output_dims); + output_image_h_ = output_image_shape["height"]; + output_image_w_ = output_image_shape["width"]; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + CHECK_GE(conv_param_->x->dims().size(), 4); + CHECK_GE(conv_param_->output->dims().size(), 4); + if (kernel_func_names_.size() > 0 && + kernel_func_names_[0] == "conv2d_3x3") { + groups_ = conv_param_->groups; + if (filter_tensor_n_ == output_tensor_c_ && + filter_tensor_c_ == input_tensor_c_) { + groups_ = 1; + } else if (!(filter_tensor_n_ == input_tensor_c_ && + filter_tensor_c_ == 1)) { + groups_ = input_tensor_c_ / filter_tensor_c_; + } } - */ - - // const std::vector& default_work_size = - // DefaultWorkSize(output_dims, - // DDim(std::vector{ - // static_cast(out_image_shape["width"]), - // static_cast(out_image_shape["height"])})); - - // int c_block = default_work_size[0]; - // int w = default_work_size[1]; - // int nh = default_work_size[2]; - - // VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_c_block: " << input_c_block; - // VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - // VLOG(4) << "input_dims: " << input_dims; - // VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - // VLOG(4) << "output_dims: " << output_dims; - // VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - // << out_image_shape["height"]; - // VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - // VLOG(4) << "has bias: " << has_bias; - // VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - // VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - // VLOG(4) << "offset: " << offset; - // VLOG(4) << "dilations.size : " << dilations.size(); - // VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - // VLOG(4) << "param.groups(groups):" << param.groups; - // VLOG(4) << "new_groups:" << new_groups; - // VLOG(4) << "default work size{c_block, w, nh}: " - // << "{" << c_block << ", " << w << ", " << nh << "" - // << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - auto& context = 
ctx_->As(); - CHECK(context.cl_context() != nullptr); - // STL::stringstream kernel_key; - // kernel_key << kernel_func_names_[0] << build_options_[0]; - // auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - // VLOG(4) << "kernel_key: " << kernel_key.str(); - // VLOG(4) << "kernel ready ... " << kernel_key.str(); - // VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + // define image pointer for input, output + input_image_p_ = conv_param_->x->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + + GetGlobalWorkSize(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, new_groups); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_dims[1])); - CL_CHECK_FATAL(status); - - // auto global_work_size = - // cl::NDRange{static_cast(default_work_size.data()[0]), - // static_cast(default_work_size.data()[1]), - // static_cast(default_work_size.data()[2])}; - - // VLOG(4) << "out_image: " << out_image; - // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - // << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Conv2d3x3opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int 
batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif +void ConvImageCompute::GetGlobalWorkSize() { + if (kernel_func_names_.size() <= 0) return; + // general input_c_block + input_c_block_ = static_cast(input_image_w_ / input_tensor_w_); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + // general gws + auto output_dims = conv_param_->output->dims(); + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + if (kernel_func_names_[0] == "conv2d_1x1_simple" || + kernel_func_names_[0] == "conv2d_1x1_opt") { + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + + } else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") { + // depthwise spl gws s1 + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = 
cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "depth_conv2d_3x3") { + // depthwise spl gws + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || + kernel_func_names_[0] == "conv2d_3x3_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || + kernel_func_names_[0] == "conv2d_5x5_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || + kernel_func_names_[0] == "conv2d_7x7_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d1x1opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, default_w_blk_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 
2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, filter_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, groups_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, input_tensor_c_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d3x3opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + 
CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); #ifdef LITE_WITH_LOG // VLOG(4) << "out_image: " << out_image; @@ -1086,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - -// default_work_size[2] = h_blk; +void ConvImageCompute::Conv2d5x5(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << 
dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - // VLOG(4) << "out_image: " << out_image; + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* 
input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d5x5opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d7x7(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d7x7opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_);
+ status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); +void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d 7x7 params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == 
paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + auto& context = ctx_->As(); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if 
(param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); +void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != 
nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if (param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - +void ConvImageCompute::DepthwiseConv2d(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "setArg"; - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, 
stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); +void ConvImageCompute::Run() { (this->*impl_)(false); } - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; +void ConvImageCompute::PrintConvInfo() { + const bool is_element_wise_bias = + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_; + // VLOG(4) << "input_image: " << input_image_p_; + VLOG(4) << "input_dims: " << conv_param_->x->dims(); + VLOG(4) << "filter_dims: " << conv_param_->filter->dims(); // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "output_dims: " << conv_param_->output->dims(); + VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_; + VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_; 
+ VLOG(4) << "has bias: " << has_bias_; VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - -#ifdef LITE_WITH_LOG + VLOG(4) << "strides: " << stride_h_ << "," << stride_w_; + VLOG(4) << "offset: "; + VLOG(4) << "dilations.size : " << conv_param_->dilations->size(); + VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Run() { (this->*impl_)(false); } - -double ConvImageCompute::Turn(int times) { +double ConvImageCompute::Tune(int times) { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 64276a5721cb20718604d91d3cfac31e583ddbf1..e61557a71dfbf1353decc9491b67c5e1e326512e 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -33,6 +33,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { + class ConvImageCompute : public KernelLite { @@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; @@ -79,19 +87,72 @@ class ConvImageCompute 
: public KernelLite tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; + + // opencl kernel args int c_blk_ = 1; int w_blk_ = 1; int nh_blk_ = 1; + const cl::Image2D* input_image_p_{nullptr}; + const cl::Image2D* filter_image_p_{nullptr}; + const cl::Image2D* bias_image_p_{nullptr}; + const cl::Image2D* output_image_p_{nullptr}; + + int stride_h_{-1}; + int stride_w_{-1}; + + int dilation_h_{-1}; + int dilation_w_{-1}; + + int pad_up_{-1}; + int pad_down_{-1}; + int pad_left_{-1}; + int pad_right_{-1}; + + int offset_{-1}; + int groups_{-1}; + bool relu_fused_{false}; + bool has_bias_{false}; + + int input_tensor_n_{-1}; + int input_tensor_c_{-1}; + int input_tensor_h_{-1}; + int input_tensor_w_{-1}; + int input_image_h_{-1}; + int input_image_w_{-1}; + int input_c_block_{-1}; + + int output_tensor_n_{-1}; + int output_tensor_c_{-1}; + int output_tensor_h_{-1}; + int output_tensor_w_{-1}; + int output_image_h_{-1}; + int output_image_w_{-1}; + + int filter_tensor_n_{-1}; + int filter_tensor_c_{-1}; + int filter_tensor_h_{-1}; + int filter_tensor_w_{-1}; + int filter_image_h_{-1}; + int filter_image_w_{-1}; + + int bias_image_h_{-1}; + int bias_image_w_{-1}; + int default_c_blk_ = 1; int default_w_blk_ = 1; int default_nh_blk_ = 1; + // ================= + + DDim last_input_dims_{}; + bool is_first_epoch_for_run_{true}; cl::Kernel kernel_; + cl_int status_; cl::NDRange local_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_turn_{false}; + bool use_tune_{true}; }; } // namespace opencl diff --git a/lite/kernels/opencl/expand_image_compute_test.cc b/lite/kernels/opencl/expand_image_compute_test.cc index 1fa046c938a4b45bec0ae9842ed51fc0805b4131..c372855193e938081208addce058e3e38b692cbb 100644 --- a/lite/kernels/opencl/expand_image_compute_test.cc +++ b/lite/kernels/opencl/expand_image_compute_test.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
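The conv refactor above replaces per-run parameter parsing with cached member state that is bound to fixed kernel-argument slots, and the tuning path times a launch by draining the command queue. The following is a minimal standalone sketch of that pattern; ConvArgs, BindArgs, LaunchAndMaybeTime and the reduced argument list are illustrative names for this sketch, not the Paddle-Lite classes.

// Sketch only: bind cached host-side values to explicit OpenCL argument
// slots, then optionally wait for completion so a wall-clock measurement
// covers the kernel (the tuning path above finishes the queue the same way).
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>
#include <chrono>
#include <stdexcept>

struct ConvArgs {  // hypothetical cached state; the real kernels take more
  cl_int c_blk, w_blk, nh_blk;
  cl::Image2D input, filter, bias, output;
  cl_int stride_h, pad_left, dilation_h;
};

inline void CheckCL(cl_int status) {
  if (status != CL_SUCCESS) throw std::runtime_error("OpenCL call failed");
}

// Set every argument by explicit index so the host-side order stays in
// lockstep with the kernel signature and a skipped slot is caught early.
void BindArgs(cl::Kernel& kernel, const ConvArgs& a) {
  cl_uint i = 0;
  CheckCL(kernel.setArg(i++, a.c_blk));
  CheckCL(kernel.setArg(i++, a.w_blk));
  CheckCL(kernel.setArg(i++, a.nh_blk));
  CheckCL(kernel.setArg(i++, a.input));
  CheckCL(kernel.setArg(i++, a.filter));
  CheckCL(kernel.setArg(i++, a.bias));
  CheckCL(kernel.setArg(i++, a.output));
  CheckCL(kernel.setArg(i++, a.stride_h));
  CheckCL(kernel.setArg(i++, a.pad_left));
  CheckCL(kernel.setArg(i++, a.dilation_h));
}

// Enqueue once; when tuning, block so the timing includes kernel execution.
double LaunchAndMaybeTime(cl::CommandQueue& queue, cl::Kernel& kernel,
                          const cl::NDRange& global, const cl::NDRange& local,
                          bool tune) {
  auto t0 = std::chrono::high_resolution_clock::now();
  CheckCL(queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local));
  if (tune) queue.finish();
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::micro>(t1 - t0).count();
}

Caching the image pointers and scalar arguments once per shape change, as the header members above do, keeps the per-inference path down to the setArg and enqueue calls.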
-#include #include +#include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" @@ -54,11 +54,11 @@ TEST(expand_hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -179,11 +179,11 @@ TEST(expand_c2hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -303,11 +303,11 @@ TEST(expand_c3hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -428,11 +428,11 @@ TEST(expand_c4hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -551,11 +551,11 @@ TEST(expand_n_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -35,10 +35,27 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override {} + void PrepareForRun() override { + fc_param_ = param_.get_mutable(); + auto w_t = fc_param_->w; + auto bias_t = fc_param_->bias; + + w_gpu_t_ = std::unique_ptr(new Tensor); + auto w_gpu_data = + w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size()); + TargetWrapperCL::MemcpySync( + w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD); + + bias_gpu_t_ = std::unique_ptr(new Tensor); + auto b_gpu_data = + bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size()); + TargetWrapperCL::MemcpySync(b_gpu_data, + 
bias_t->raw_data(), + bias_t->memory_size(), + IoDirection::HtoD); + } void ReInitWhenNeeded() override { - fc_param_ = param_.get_mutable(); const auto x_dims = fc_param_->input->dims(); if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || first_epoch_for_reinit_) { @@ -93,7 +110,7 @@ class FcCompute } void GetGlobalWorkSize() { - if (m_ == 1) { // gemv + if (kernel_func_name_ == "fc_gemv_1x4") { // gemv global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), @@ -103,8 +120,8 @@ class FcCompute void Run() override { auto* x_buf = fc_param_->input->data(); - auto* w_buf = fc_param_->w->data(); - auto* bias_buf = fc_param_->bias->data(); + auto* w_buf = w_gpu_t_->data(); + auto* bias_buf = bias_gpu_t_->data(); auto* out_buf = fc_param_->output->mutable_data(TARGET(kOpenCL)); @@ -154,6 +171,10 @@ class FcCompute std::string time_stamp_{GetTimeStamp()}; bool first_epoch_for_reinit_{true}; DDim last_x_dims_; + + std::unique_ptr w_gpu_t_{nullptr}; + std::unique_ptr bias_gpu_t_{nullptr}; + cl::NDRange global_work_size_; cl::Kernel kernel_; }; @@ -166,7 +187,7 @@ class FcCompute REGISTER_LITE_KERNEL( fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644 --- a/lite/kernels/opencl/fc_buffer_compute_test.cc +++ b/lite/kernels/opencl/fc_buffer_compute_test.cc @@ -126,9 +126,11 @@ TEST(fc, compute) { out.Resize(out_dim); out_ref.Resize(out_dim); + VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); - auto* w_data = w.mutable_data(TARGET(kOpenCL)); - auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); + auto* w_data = w.mutable_data(); + auto* bias_data = bias.mutable_data(); auto* out_data = out.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; @@ -148,17 +150,15 @@ TEST(fc, compute) { } for (size_t i = 0; i < w_dim.production(); ++i) { w_source[i] = static_cast(dist(engine)); + w_data[i] = w_source[i]; } for (size_t i = 0; i < bias_dim.production(); ++i) { bias_source[i] = 10; // static_cast(dist(engine)); + bias_data[i] = 10; } TargetWrapperCL::MemcpySync( x_data, x_source.data(), x_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - w_data, w_source.data(), w_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - bias_data, bias_source.data(), bias_size, IoDirection::HtoD); // run opencl kernel kernel->Launch(); @@ -186,8 +186,10 @@ TEST(fc, compute) { #endif std::vector out_data_from_gpu(out_dim.production()); - TargetWrapperCL::MemcpySync( - out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH); + TargetWrapperCL::MemcpySync(out_data_from_gpu.data(), + out_data, + out_data_from_gpu.size() * sizeof(float), + IoDirection::DtoH); // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); diff --git a/lite/kernels/opencl/nearest_interp_image_compute_test.cc 
b/lite/kernels/opencl/nearest_interp_image_compute_test.cc index 4a9948832d1a96d95a7f317bd3ac8245292ae02b..fb40da290d10ed49f293cf7ff78865f2e7967eab 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute_test.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute_test.cc @@ -155,6 +155,7 @@ TEST(nearest_interp_image2d, compute) { auto *x_data = x.mutable_data(TARGET(kOpenCL)); auto *y_data = y.mutable_data(TARGET(kOpenCL)); auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); + memset(reinterpret_cast(y_data_ref), 0, y_ref.numel()); auto *mapped_x = static_cast(TargetWrapperCL::Map( x_data, 0, sizeof(float) * x_dim.production())); auto *mapped_y = static_cast(TargetWrapperCL::Map( diff --git a/lite/kernels/opencl/transpose_image_compute.cc b/lite/kernels/opencl/transpose_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..31184092efa40cea47c3cacb6a65f03d15a229b2 --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +#undef LITE_WITH_LOG + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +// transpose operator +class TransposeComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + if (out_dims.size() == 4) { + kernel_func_name_ = "transpose_4d"; + } else { + kernel_func_name_ = "transpose"; + } + auto& context = ctx_->As(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, + "image/transpose_kernel.cl", + build_options_, + time_stamp_); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void Run() override { + auto& param = *param_.get_mutable(); + const Tensor* const x = param.x; + const auto x_dims = x->dims(); + const std::map& input_image_shape = + InitImageDimInfoWith(x_dims); + const int64_t& input_image_width = input_image_shape.at("width"); + const int64_t& input_image_height = input_image_shape.at("height"); + const cl::Image2D* const x_image = x->data(); + + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + VLOG(4) << "out_dims= " 
<< out_dims; + const std::map& out_image_shape = + InitImageDimInfoWith(out_dims); + cl::Image2D* const out_image = output->mutable_data( + out_image_shape.at("width"), out_image_shape.at("height")); +#ifdef LITE_WITH_LOG + VLOG(4) << "out_dims= " << out_dims; +#endif + const std::vector& default_work_size = DefaultWorkSize( + out_dims, + DDim(std::vector{ + static_cast(out_image_shape.at("width")), + static_cast(out_image_shape.at("height"))})); + + int out_C = 0, out_H = 0, out_W = 0, in_W = 0; + if (param.output->dims().size() == 4) { + out_C = out_dims[1]; + out_H = out_dims[2]; + out_W = out_dims[3]; + in_W = x_dims[3]; + } else if (param.output->dims().size() == 3) { + out_C = out_dims[0]; + out_H = out_dims[1]; + out_W = out_dims[2]; + in_W = x_dims[2]; + } else if (param.output->dims().size() == 2) { + out_C = 1; + out_H = out_dims[0]; + out_W = out_dims[1]; + in_W = x_dims[1]; + } + +#ifdef LITE_WITH_LOG + VLOG(4) << "out_C=" << out_C; + VLOG(4) << "out_H=" << out_H; + VLOG(4) << "out_W=" << out_W; + VLOG(4) << "in_W=" << in_W; + VLOG(4) << "default_work_size= " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifdef LITE_WITH_LOG + VLOG(4) << TargetToStr(x->target()); + VLOG(4) << TargetToStr(param.output->target()); +#endif + + int arg_idx = 0; + cl_int status; + status = kernel.setArg(arg_idx, *x_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_H); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_W); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, in_W); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } + + private: + std::string kernel_func_name_{"transpose"}; + std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; +}; + +// transpose2 operator +class Transpose2ComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override {} + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {} +#endif + + bool IsShuffleChannel(const std::vector& axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; + } + + template + void DeviceTensorToHostTensor(const Tensor* device_tensor, + Tensor* host_tensor) { + host_tensor->Resize(device_tensor->dims()); + Dtype* host_ptr = host_tensor->mutable_data(); + CLRuntime::Global()->command_queue().finish(); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + half_t* image_data = new 
half_t[device_tensor_image_dim.production() * 4]; + TargetWrapperCL::ImgcpySync(image_data, + device_tensor->data(), + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::DtoH); + default_converter.ImageToNCHW( + image_data, host_ptr, device_tensor_image_dim, host_tensor->dims()); + delete[] image_data; + } + + template + void HostTensorToDeviceTensor(const Tensor* host_tensor, + Tensor* device_tensor) { + Dtype* host_ptr = const_cast(host_tensor->data()); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + device_tensor->mutable_data( + device_tensor_image_dim[0], device_tensor_image_dim[1]); + half_t* image_data = new half_t[device_tensor->dims().production() * 4]; + default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims()); + + TargetWrapperCL::ImgcpySync( + device_tensor->mutable_data(), + image_data, + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::HtoD); + + delete[] image_data; + } + + template + void ShuffleChannelCompute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < param.axis.size(); ++i) { + offset *= in_dim[i]; + } +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = + ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + template + void Transpose2Compute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + auto& axis = param.axis; + int permute = axis.size(); // permute must >=2 && <= 6. 
+ for (int i = 0; i < permute; ++i) { + int k = permute - 1 - i; + strides[k] = 1; + for (int j = axis[i] + 1; j < permute; ++j) { + strides[k] *= in_dim[j]; + } + rout_dim[k] = out_dim[i]; + } + + // unroll the first 2 dimensions + int reamin_dim = 1; + for (int i = 2; i < out_dim.size(); ++i) { + reamin_dim *= out_dim[i]; + } + +#pragma omp parallel for collapse(2) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int j = 0; j < out_dim[1]; ++j) { + size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; + Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; + int indics[4] = {0, 0, 0, 0}; + for (int k = 0; k < reamin_dim; ++k) { + out_ptr[k] = input_ptr[offset]; + indics[0] += 1; + offset += strides[0]; + for (int p = 0; p < permute - 3; ++p) { + if (indics[p] == rout_dim[p]) { + indics[p + 1] += 1; + indics[p] = 0; + offset += strides[p + 1]; + offset -= rout_dim[p] * strides[p]; + } else { + break; + } + } + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + void Run() override { + auto& param = *param_.get_mutable(); + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(param); + } else { + Transpose2Compute(param); + } + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(transpose, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::TransposeComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::Transpose2ComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/transpose_image_compute_test.cc b/lite/kernels/opencl/transpose_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9db9b3732d44aa3f342a8cf8b7b2fe5819586a5f --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
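The transpose2 kernel above falls back to host-side computation: the image is copied back to an NCHW buffer, permuted on the CPU, and written back as an image. Below is a small reference version of the permutation step only, independent of the OpenCL types; Transpose4D is an illustrative name and the converter and image copies are omitted. It is a generic sketch, not the optimized strided loop used in the kernel.

// Reference 4-D NCHW transpose: out coordinate i reads the input element at
// the stride of the axis it was taken from.
#include <array>
#include <cassert>
#include <iostream>
#include <vector>

std::vector<float> Transpose4D(const std::vector<float>& in,
                               const std::array<int, 4>& in_dim,
                               const std::array<int, 4>& axis) {
  std::array<int, 4> out_dim;
  for (int i = 0; i < 4; ++i) out_dim[i] = in_dim[axis[i]];
  // input strides in NCHW order
  std::array<int, 4> in_stride{in_dim[1] * in_dim[2] * in_dim[3],
                               in_dim[2] * in_dim[3], in_dim[3], 1};
  std::vector<float> out(in.size());
  int idx = 0;
  std::array<int, 4> o{};  // output coordinate
  for (o[0] = 0; o[0] < out_dim[0]; ++o[0])
    for (o[1] = 0; o[1] < out_dim[1]; ++o[1])
      for (o[2] = 0; o[2] < out_dim[2]; ++o[2])
        for (o[3] = 0; o[3] < out_dim[3]; ++o[3]) {
          int src = 0;
          for (int i = 0; i < 4; ++i) src += o[i] * in_stride[axis[i]];
          out[idx++] = in[src];
        }
  return out;
}

int main() {
  // 1x2x3x4 tensor filled with 0..23, permuted with axis {0, 2, 3, 1},
  // matching the axis order exercised by the unit test below.
  std::vector<float> x(24);
  for (int i = 0; i < 24; ++i) x[i] = static_cast<float>(i);
  auto y = Transpose4D(x, {1, 2, 3, 4}, {0, 2, 3, 1});
  assert(y.size() == x.size());
  std::cout << y[1] << "\n";  // input element (n=0, c=1, h=0, w=0) -> 12
  return 0;
}

The shuffle-channel fast path above is the special case axis = (0, 2, 1, ...), which lets the inner dimensions be moved with a single memcpy per block instead of an element-wise loop.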
+ +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/operators/reshape_op.h" +#include "lite/utils/logging.h" + +#define FP16_MAX_DIFF (5e-1) + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +static inline void TestWithKernel( + const std::unique_ptr& kernel) { + int64_t batch_size = 1; + int64_t ic = 2; + int64_t ih = 3; + int64_t iw = 4; + + int64_t oc = 3; + int64_t oh = 4; + int64_t ow = 2; + + lite::Tensor input, output; + operators::TransposeParam param; + + param.x = &input; + param.output = &output; + param.axis = std::vector({0, 2, 3, 1}); + const DDim input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + input.Resize(input_dim); + const DDim output_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + param.output->Resize(output_dim); + + LOG(INFO) << "prepare kernel SetParam------"; + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t output_image_width = ow * ((oc + 3) / 4); + size_t output_image_height = oh * batch_size; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::vector input_v(batch_size * ic * ih * iw); + + LOG(INFO) << "gen input ..."; + + float* input_v_data = &input_v[0]; + auto index = 0; + for (auto& i : input_v) { + i = index++; + } + + paddle::lite::CLImageConverterDefault default_convertor; + + std::vector x_image_data(input_image_width * input_image_height * + 4); // 4 : RGBA + + LOG(INFO) << "set mapped input ..."; + default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); + + auto* input_image = input.mutable_data( + input_image_width, input_image_height, x_image_data.data()); + + LOG(INFO) << "prepare kernel ready"; + + LOG(INFO) << "mutable output ..."; + CLImageConverterDefault default_converter; + DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output.mutable_data( + out_image_shape[0], out_image_shape[1]); + + LOG(INFO) << "kernel context ..."; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr transpose_context(new KernelContext); + context->As().CopySharedTo( + &(transpose_context->As())); + kernel->SetContext(std::move(transpose_context)); + + LOG(INFO) << "kernel launch ..."; + kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + half_t* out_image_data = new half_t[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + output.data(), + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter.ImageToNCHW( + out_image_data, out_data, out_image_shape, output_dim); + + // check output data + index = 0; + auto hxw = ih * iw; + auto cxhxw = ic * hxw; + for (auto n = 0; n < batch_size; n++) { + for (auto h = 0; h < ih; h++) { + for (auto w = 0; w < iw; w++) { + for (auto c = 0; c < ic; c++) { + auto input_index = n * cxhxw + c * hxw + h * iw + w; + auto input_value = input_v_data[input_index]; + auto output_value = out_data[index]; + auto abs_diff = abs(input_value - output_value); + auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, 
output_value); + EXPECT_EQ( + (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), + true); + index++; + } + } + } + } +} + +TEST(transpose_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +TEST(transpose2_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose2", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d); diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc index e0b63205705609b6899918ce8e254ccdf6cbad47..da01539b291d57da1501f8c3790acae8496581f3 100644 --- a/lite/kernels/rknpu/subgraph_compute.cc +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -28,117 +28,55 @@ namespace lite { namespace kernels { namespace rknpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; int status = 0; // Convert all of ops and their input vars and weights and added into the NPU // RKNPU IR graph subgraph::rknpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kRKNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kRKNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Collect the valid input and output nodes in the RKNPU IR graph and update // the input and output names - device_inames_.clear(); - device_onames_.clear(); - - for (auto& input_name : input_names_) { - LOG(INFO) << "[RKNPU] Input node " << input_name; - if (graph.Has(input_name)) { - LOG(INFO) << input_name << " Precision " - << PrecisionToStr(graph.Get(input_name)->precision()); - device_itensors_.push_back(graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[RKNPU] Input node " << input_name - << " is ignored because it does not exist."; - } - } - - for (auto& output_name : output_names_) { - LOG(INFO) << "[RKNPU] Output node " << output_name; - if (graph.Has(output_name)) { - auto tensor = scope_->FindMutableTensor(output_name); - LOG(INFO) << output_name << " Precision " - << PrecisionToStr(tensor->precision()); - device_otensors_.push_back(graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[RKNPU] Output node " << output_name - << " is ignored because it does not exist."; - } - } - CHECK(!device_inames_.empty()) - << "[RKNPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[RKNPU] No output nodes found for building NPU model"; - - device_program_ = lite::rknpu::Device::Global().Build( - model_name_, graph.GetHandle(), 
device_itensors_, device_otensors_); - if (device_program_ == nullptr) { - LOG(WARNING) << "[RKNPU] Build model failed!"; - return subgraph::FAILED; - } - - // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); + device_itensors_.clear(); + device_otensors_.clear(); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - } - // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); - for (size_t i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - - auto output_dims = origin_otensors_[i]->dims(); - } - - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); + CHECK(graph.Has(input_names_[i])) << "[RKNPU] Failed to find input node " + << input_names_[i]; + auto node = graph.Get(input_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - - LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i] + LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << input_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); + device_itensors_.push_back(node->data()); } - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); + for (size_t i = 0; i < output_names_.size(); i++) { + CHECK(graph.Has(output_names_[i])) << "[RKNPU] Failed to find output node " + << output_names_[i]; + auto node = graph.Get(output_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i] + LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << output_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); // Prepare the device output tensors @@ -159,22 +97,30 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_[i]->mutable_data(); break; default: - LOG(FATAL) << "[RKNPU] " << device_onames_[i] + LOG(FATAL) << "[RKNPU] " << output_names_[i] << " can't mutable data with precision type " << PrecisionToStr(precision); break; } + device_otensors_.push_back(node->data()); + } + // Create the RKNPU model and set the input and output nodes + device_program_ = lite::rknpu::Device::Global().Build( + model_name_, graph.GetHandle(), device_itensors_, device_otensors_); + if (device_program_ == nullptr) { + LOG(WARNING) << "[RKNPU] Build model failed!"; + return false; } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { LOG(INFO) << "[RKNPU]:LaunchDeviceProgram"; std::vector inputs; std::vector outputs; - 
inputs.resize(device_itensors_.size()); - for (size_t i = 0; i < device_itensors_.size(); i++) { + inputs.resize(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); i++) { inputs[i].index = i; inputs[i].buf = const_cast(origin_itensors_[i]->raw_data()); inputs[i].size = origin_itensors_[i]->memory_size(); @@ -184,8 +130,8 @@ int SubgraphEngine::LaunchDeviceProgram() { inputs[i].layout = rk::nn::DataLayoutType::NCHW; } - outputs.resize(device_otensors_.size()); - for (size_t i = 0; i < device_otensors_.size(); i++) { + outputs.resize(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); i++) { outputs[i].index = i; outputs[i].buf = const_cast(origin_otensors_[i]->raw_data()); outputs[i].size = origin_otensors_[i]->memory_size(); @@ -195,26 +141,25 @@ int SubgraphEngine::LaunchDeviceProgram() { device_program_->SetInputs(inputs); device_program_->Run(); device_program_->GetOutputs(outputs); - return 0; + return true; } void SubgraphCompute::PrepareForRun() { LOG(INFO) << "[RKNPU]:PrepareForRun"; auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { LOG(INFO) << "[RKNPU]:Run"; CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace rknpu diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h index 863e6aef39ad54f0e9d94d4b507c6fca4128ebb8..78162b3d165bde8e33436654bbcd1110ad9afea6 100644 --- a/lite/kernels/rknpu/subgraph_compute.h +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -34,22 +34,26 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::string model_name_; std::vector device_inames_; std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 2910364f37b74d94977e2397e31eb97fd367825e..9b4c2fadd9ce427db272a9bb0cfd0e0a10716f11 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftsignCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute_test.cc b/lite/kernels/x86/activation_compute_test.cc index 
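The RKNPU subgraph changes above drop the old int/subgraph::FAILED status protocol in favour of plain bool returns, and the device program is now built lazily: PrepareForRun no longer calls engine_->Build(), the first engine_->Run() does. Below is a toy, self-contained model of that contract; only the BuildDeviceProgram/LaunchDeviceProgram override names come from this patch, everything else (ToyEngine, the built_ flag) is illustrative.

```cpp
// Toy model of the refactored subgraph engine contract: the device program is
// built lazily on the first Run() instead of inside PrepareForRun(), and both
// hooks report success/failure as bool rather than a FAILED status code.
#include <iostream>

class ToyEngine {
 public:
  virtual ~ToyEngine() = default;

  bool Run() {
    if (!built_) {
      built_ = BuildDeviceProgram();  // first call converts and builds
      if (!built_) return false;      // e.g. an op has no RKNPU bridge
    }
    return LaunchDeviceProgram();
  }

 protected:
  virtual bool BuildDeviceProgram() { return true; }
  virtual bool LaunchDeviceProgram() { return true; }

 private:
  bool built_{false};
};

int main() {
  ToyEngine engine;
  std::cout << (engine.Run() ? "ok" : "failed") << "\n";
  return 0;
}
```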
8cc2607e73e605214e08e42e70de457a206e2468..550cf299f676105271e758eb1a13e880045ee1cc 100644 --- a/lite/kernels/x86/activation_compute_test.cc +++ b/lite/kernels/x86/activation_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/activation_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc index 35ce822e010fc3ce2dc756b86e3a437789cc8359..5c672a1ee05116ccefec074f54d0726a7cd010ea 100644 --- a/lite/kernels/x86/attention_padding_mask_compute_test.cc +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/attention_padding_mask_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/attention_padding_mask_compute.cc" namespace paddle { namespace lite { @@ -81,8 +83,7 @@ int get_max_len(const LoD& lod) { TEST(attention_padding_mask_x86, retrive_op) { auto attention_padding_mask = - KernelRegistry::Global().Create( - "attention_padding_mask"); + KernelRegistry::Global().Create("attention_padding_mask"); ASSERT_FALSE(attention_padding_mask.empty()); ASSERT_TRUE(attention_padding_mask.front()); } diff --git a/lite/kernels/x86/batch_norm_compute_test.cc b/lite/kernels/x86/batch_norm_compute_test.cc index 5ec2cdcdda0e9ff3698c80584b36396b38328e03..dd70f78efa7334355c459fd1d85a7da4f5b05b60 100644 --- a/lite/kernels/x86/batch_norm_compute_test.cc +++ b/lite/kernels/x86/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/batch_norm_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(batch_norm_x86, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc index f7aa52ca6d0dde603357f009220b4a3a53f56833..b039cf5d3b01032e60ef7bdcf31a45c8ed302215 100644 --- a/lite/kernels/x86/cast_compute_test.cc +++ b/lite/kernels/x86/cast_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
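The x86 test updates in this patch all collapse the old templated KernelRegistry lookups into the plain Create(op_name) overload; target, precision and layout can still be passed as extra arguments, as the OpenCL transpose test above does. A minimal retrieve-and-check test in the same style, assuming only the headers these tests already include:

```cpp
#include <gtest/gtest.h>

#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

// Look the kernel up by op name only. A filtered lookup such as
// Create("transpose", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault))
// is also available, as used by the OpenCL test earlier in this patch.
TEST(relu_x86, retrieve_op_sketch) {
  auto relu = KernelRegistry::Global().Create("relu");
  ASSERT_FALSE(relu.empty());
  ASSERT_TRUE(relu.front());
}

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
```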
-#include "lite/kernels/x86/cast_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/cast_compute.h" namespace paddle { namespace lite { @@ -25,8 +27,7 @@ namespace kernels { namespace x86 { TEST(cast_x86, retrive_op) { - auto cast = - KernelRegistry::Global().Create("cast"); + auto cast = KernelRegistry::Global().Create("cast"); ASSERT_FALSE(cast.empty()); ASSERT_TRUE(cast.front()); } diff --git a/lite/kernels/x86/concat_compute_test.cc b/lite/kernels/x86/concat_compute_test.cc index 468e9422752561ff6416e8859b485462b9e2abbe..4be51dff6ed613842de431cce8a7960182073c4f 100644 --- a/lite/kernels/x86/concat_compute_test.cc +++ b/lite/kernels/x86/concat_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/concat_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/concat_compute.h" namespace paddle { namespace lite { @@ -23,9 +25,7 @@ namespace kernels { namespace x86 { TEST(concat_x86, retrive_op) { - auto concat = - KernelRegistry::Global().Create( - "concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index 2827c6577e5bf311b4002526d4ac10f636162d96..cd46571a2a9fd6b428f84ca278a453c8675d6ed6 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/conv_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/conv_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(conv_x86, retrive_op) { - auto conv2d = - KernelRegistry::Global().Create( - "conv2d"); + auto conv2d = KernelRegistry::Global().Create("conv2d"); ASSERT_FALSE(conv2d.empty()); ASSERT_TRUE(conv2d.front()); } diff --git a/lite/kernels/x86/dropout_compute_test.cc b/lite/kernels/x86/dropout_compute_test.cc index 279f639f40ece0a10e45fe16f36fcb443cea550a..d30fbbea670d9509e722e3a27fd3dbf1d89a308c 100644 --- a/lite/kernels/x86/dropout_compute_test.cc +++ b/lite/kernels/x86/dropout_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/dropout_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/dropout_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(dropout_x86, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/x86/elementwise_compute_test.cc b/lite/kernels/x86/elementwise_compute_test.cc index 9850c0ce86756cd12e28ab95688b79a1c539189c..6379faacad75f98f73eafbdfc2f8c9deb4d086cb 100644 --- a/lite/kernels/x86/elementwise_compute_test.cc +++ b/lite/kernels/x86/elementwise_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/elementwise_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/elementwise_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(elementwise_add_x86, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index f736248ed3632af92dea2823439e6e7d28ff3e1b..4cb7160097e320798c1b1e2ee94d7fec8aedc6d6 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "lite/fluid/for_range.h" #include "lite/fluid/transform.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" #include "lite/utils/variant.h" namespace paddle { @@ -66,9 +65,8 @@ inline void get_mid_dims(const lite::DDim &x_dims, for (size_t i = 0; i < y_dims.size(); ++i) { if (x_dims[i + axis] != y_dims[i]) { // only support single y_dims[i] = 1 now. - PADDLE_ENFORCE_EQ( - *mid_flag, 0, "Broadcast support y_dims with single 1."); - PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch."); + CHECK_EQ(*mid_flag, 0) << "Broadcast support y_dims with single 1."; + CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch."; // m*n*k m*1*k for (size_t j = 0; j < i; ++j) { (*pre) *= y_dims[j]; @@ -95,8 +93,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, } for (size_t i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch."); + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; (*n) *= y_dims[i]; } @@ -314,17 +311,16 @@ void ElementwiseComputeEx(const lite::Context &ctx, TransformFunctor functor(x, y, z, ctx, func); auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - y_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + CHECK_GE(x_dims.size(), y_dims_untrimed.size()) + << "Rank of first input must >= rank of second input."; if (x_dims == y_dims_untrimed) { functor.Run(); return; } axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < static_cast(x_dims.size()), - "Axis should be in range [0, x_dims)"); + CHECK(axis >= 0 && axis < static_cast(x_dims.size())) + << "Axis should be in range [0, x_dims)"; auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post, mid_flag = 0; @@ -560,9 +556,8 @@ void FusedElemwiseAndActComputeEx(const lite::Context &ctx, lite::Tensor *out, lite::Tensor *intermediate_out) { if (KeepIntermediateOut) { - PADDLE_ENFORCE(intermediate_out, - "The save_intermediate_out is opened, " - "intermediate_out should not be nullptr."); + CHECK(intermediate_out) << "The save_intermediate_out is opened, " + "intermediate_out should not be nullptr."; } const lite::DDim &x_dim = x.dims(); diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc index 16bec18a1c1c4d0075e1ed1dcc4f3a3462917868..e3e8b13413808b447018ac14acf9d4a16c0f47a6 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,8 +29,7 @@ namespace x86 { TEST(fill_constant_batch_size_like_x86, retrive_op) { auto fill_constant_batch_size_like = - KernelRegistry::Global().Create( - "fill_constant_batch_size_like"); + KernelRegistry::Global().Create("fill_constant_batch_size_like"); ASSERT_FALSE(fill_constant_batch_size_like.empty()); ASSERT_TRUE(fill_constant_batch_size_like.front()); } diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc index 286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19..63284452244b19b807f8b101cab5cbabbbf68476 100644 --- a/lite/kernels/x86/gather_compute_test.cc +++ b/lite/kernels/x86/gather_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gather_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gather_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(gather_x86, retrive_op) { - auto gather = - KernelRegistry::Global().Create( - "gather"); + auto gather = KernelRegistry::Global().Create("gather"); ASSERT_FALSE(gather.empty()); int cnt = 0; for (auto item = gather.begin(); item != gather.end(); ++item) { diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc index e930cd32df91196fa9f4559ee6ba22bd8b82d337..9bda9ac4c1c0cee84141095b3100bb82a99661b7 100644 --- a/lite/kernels/x86/gelu_compute_test.cc +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
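The PADDLE_ENFORCE* calls in elementwise_op_function.h are replaced by glog-style CHECK/CHECK_EQ/CHECK_GE macros, which stream their message after the condition instead of taking it as an argument. A small standalone sketch of the broadcast-axis validation written with those macros; the helper name and the plain std::vector dims are illustrative, and the macros are assumed to come from lite/utils/cp_logging.h, which the patched header keeps including.

```cpp
#include <cstdint>
#include <vector>

#include "lite/utils/cp_logging.h"  // CHECK, CHECK_EQ, CHECK_GE

// Validate that y's dims can be broadcast against x's dims starting at `axis`,
// mirroring the checks get_mid_dims()/ElementwiseComputeEx() now perform.
void CheckBroadcastDims(const std::vector<int64_t>& x_dims,
                        const std::vector<int64_t>& y_dims,
                        int axis) {
  CHECK_GE(x_dims.size(), y_dims.size())
      << "Rank of first input must >= rank of second input.";
  CHECK(axis >= 0 && axis < static_cast<int>(x_dims.size()))
      << "Axis should be in range [0, x_dims)";
  for (size_t i = 0; i < y_dims.size(); ++i) {
    CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
  }
}

int main() {
  CheckBroadcastDims({2, 3, 4, 5}, {3, 4}, /*axis=*/1);  // passes
  return 0;
}
```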
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gelu_x86, retrive_op) { - auto gelu = - KernelRegistry::Global().Create("gelu"); + auto gelu = KernelRegistry::Global().Create("gelu"); ASSERT_FALSE(gelu.empty()); ASSERT_TRUE(gelu.front()); } diff --git a/lite/kernels/x86/gru_compute_test.cc b/lite/kernels/x86/gru_compute_test.cc index 3e0e944f23bafda6a5eb742a8e4b023c268c9955..c4a0045b3c1b27dfb1b518aede7dad2872cd1dc2 100644 --- a/lite/kernels/x86/gru_compute_test.cc +++ b/lite/kernels/x86/gru_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gru_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gru_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gru_x86, retrive_op) { - auto gru = - KernelRegistry::Global().Create("gru"); + auto gru = KernelRegistry::Global().Create("gru"); ASSERT_FALSE(gru.empty()); ASSERT_TRUE(gru.front()); } diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index 46d151bbc406e19b498b87420029da7f9c1c2f12..ba75dad11b75441dc09b75224bfc4dfb271396a8 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -63,10 +63,10 @@ class LayerNormCompute : public KernelLite { out.ShareDataWith(*y); out.Resize(matrix_shape); - PADDLE_ENFORCE_EQ(Mean->numel(), left); - PADDLE_ENFORCE_EQ(Var->numel(), left); - PADDLE_ENFORCE_EQ(Scale->numel(), right); - PADDLE_ENFORCE_EQ(Bias->numel(), right); + CHECK_EQ(Mean->numel(), left); + CHECK_EQ(Var->numel(), left); + CHECK_EQ(Scale->numel(), right); + CHECK_EQ(Bias->numel(), right); auto ker = paddle::lite::jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc index d39500a5e8827230ddeecd6bbe30f8c0a47ee929..617f1fae066aa6dc5068d293f8e977a2d37fe496 100644 --- a/lite/kernels/x86/layer_norm_compute_test.cc +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/layer_norm_compute.h" #include + #include #include #include + #include "lite/backends/x86/jit/helper.h" #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernels.h" #include "lite/core/op_registry.h" +#include "lite/kernels/x86/layer_norm_compute.h" namespace paddle { namespace lite { @@ -74,9 +76,7 @@ std::vector ref(lite::Tensor* x, // layer_norm TEST(layer_norm_x86, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/x86/leaky_relu_compute_test.cc b/lite/kernels/x86/leaky_relu_compute_test.cc index 76daf4ff9ffc5dea8b532610abc917406356b3a5..75ebcf071298d072682b6ea535b3c8244c328500 100644 --- a/lite/kernels/x86/leaky_relu_compute_test.cc +++ b/lite/kernels/x86/leaky_relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,9 +26,7 @@ namespace kernels { namespace x86 { TEST(leaky_relu_x86, retrive_op) { - auto leaky_relu = - KernelRegistry::Global().Create( - "leaky_relu"); + auto leaky_relu = KernelRegistry::Global().Create("leaky_relu"); ASSERT_FALSE(leaky_relu.empty()); ASSERT_TRUE(leaky_relu.front()); } diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc index 0c3f3ad50940ab0059ab04fb507a786f735584b9..02ed8e1b4bb3a7bccc8560cb1f51166d3833e6bf 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute_test.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/match_matrix_tensor_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/match_matrix_tensor_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(match_matrix_tensor_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "match_matrix_tensor"); + auto kernel = KernelRegistry::Global().Create("match_matrix_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/matmul_compute_test.cc b/lite/kernels/x86/matmul_compute_test.cc index 53d2d1a47a0cdbdaf5dfa83a79987d908171a36d..1e98702193af11ea8678bdfbc2382c7845c49b38 100644 --- a/lite/kernels/x86/matmul_compute_test.cc +++ b/lite/kernels/x86/matmul_compute_test.cc @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/matmul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/matmul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(matmul_x86, retrive_op) { - auto matmul = - KernelRegistry::Global().Create( - "matmul"); + auto matmul = KernelRegistry::Global().Create("matmul"); ASSERT_FALSE(matmul.empty()); ASSERT_TRUE(matmul.front()); } diff --git a/lite/kernels/x86/mul_compute_test.cc b/lite/kernels/x86/mul_compute_test.cc index 32d82cbb77aeb71dcd1c172ec0c1e343c3954fea..0d66a2dbd6eb27dac6acde47cc395c3c6245b1b5 100644 --- a/lite/kernels/x86/mul_compute_test.cc +++ b/lite/kernels/x86/mul_compute_test.cc @@ -12,21 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/mul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/mul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(mul_x86, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 4ea727cedd5206f5f1ac2685297f72c3019bb313..d67d3a1de2248a1f8c180867c76b5d31affc11b9 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/pool_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/pool_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(pool_x86, retrive_op) { - auto pool2d = - KernelRegistry::Global().Create( - "pool2d"); + auto pool2d = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool2d.empty()); ASSERT_TRUE(pool2d.front()); } diff --git a/lite/kernels/x86/relu_compute_test.cc b/lite/kernels/x86/relu_compute_test.cc index 37ed6db7f919e31828f89462fa46d5263c480fcc..c2233bd04cf33c983db521335d88339592d2ce6b 100644 --- a/lite/kernels/x86/relu_compute_test.cc +++ b/lite/kernels/x86/relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/reshape_compute_test.cc b/lite/kernels/x86/reshape_compute_test.cc index 16fc8f31aded0ef62fdf14aa671a73ccf6635fb7..88f38adee4aa413ac91bfdec0294c816020942b5 100644 --- a/lite/kernels/x86/reshape_compute_test.cc +++ b/lite/kernels/x86/reshape_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/reshape_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/reshape_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,9 +29,7 @@ namespace x86 { // reshape TEST(reshape_x86, retrive_op) { - auto reshape = - KernelRegistry::Global().Create( - "reshape"); + auto reshape = KernelRegistry::Global().Create("reshape"); ASSERT_FALSE(reshape.empty()); ASSERT_TRUE(reshape.front()); } @@ -86,9 +87,7 @@ TEST(reshape_x86, run_test) { // reshape2 TEST(reshape2_x86, retrive_op) { - auto reshape2 = - KernelRegistry::Global().Create( - "reshape2"); + auto reshape2 = KernelRegistry::Global().Create("reshape2"); ASSERT_FALSE(reshape2.empty()); ASSERT_TRUE(reshape2.front()); } diff --git a/lite/kernels/x86/scale_compute_test.cc b/lite/kernels/x86/scale_compute_test.cc index 6da27f444c7ed4c5a86e5f08a6c1612110bb02b9..dafb1e590f27f14208cff1e9aef79b28256cd048 100644 --- a/lite/kernels/x86/scale_compute_test.cc +++ b/lite/kernels/x86/scale_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/scale_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/scale_compute.h" namespace paddle { namespace lite { @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(scale_x86, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc index 425df2a0f0544d7345923cb2efdce96074845311..515a5e30c81e9edd6b9ebb8e52955b5de6ec9e24 100644 --- a/lite/kernels/x86/search_fc_compute_test.cc +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_fc_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_fc_compute.h" namespace paddle { namespace lite { @@ -53,9 +55,7 @@ void fc_cpu_base(const lite::Tensor* X, } TEST(search_fc_x86, retrive_op) { - auto search_fc = - KernelRegistry::Global().Create( - "search_fc"); + auto search_fc = KernelRegistry::Global().Create("search_fc"); ASSERT_FALSE(search_fc.empty()); ASSERT_TRUE(search_fc.front()); } diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc index b85d97e3f1be1f2f02837d347e42ce6731c58414..d120ca7500513bc99b71bf0003ec31bcf1e2ac19 100644 --- a/lite/kernels/x86/search_grnn_compute_test.cc +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_grnn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_grnn_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_grnn_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_grnn"); + auto kernel = KernelRegistry::Global().Create("search_grnn"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc index f4c36c2a63488a6bb902a2b8b4ad81fa32b37672..ae2007e463c0fc97a099cd5ae902b623e361066c 100644 --- a/lite/kernels/x86/search_group_padding_compute_test.cc +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/search_group_padding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_group_padding_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace x86 { TEST(search_group_padding_x86, retrieve_op) { auto search_group_padding = - KernelRegistry::Global().Create( - "search_group_padding"); + KernelRegistry::Global().Create("search_group_padding"); ASSERT_FALSE(search_group_padding.empty()); ASSERT_TRUE(search_group_padding.front()); } diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc index 0d978b35ed040d6b7c44354f37999e6e34e2e3ef..32bf3276bb378beafbf273ffe7142b9b8fc493ac 100644 --- a/lite/kernels/x86/search_seq_depadding_compute_test.cc +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_seq_depadding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_seq_depadding_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_seq_depadding_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_seq_depadding"); + auto kernel = KernelRegistry::Global().Create("search_seq_depadding"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc index 3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec..d80d3c2d1097fe2bbb47eb4c9d1384ae54d7fe8c 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute_test.cc +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_arithmetic_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_arithmetic_compute.h" namespace paddle { namespace lite { @@ -77,8 +79,7 @@ void prepare_input(Tensor* x, const LoD& x_lod) { TEST(sequence_arithmetic_x86, retrive_op) { auto sequence_arithmetic = - KernelRegistry::Global().Create( - "sequence_arithmetic"); + KernelRegistry::Global().Create("sequence_arithmetic"); ASSERT_FALSE(sequence_arithmetic.empty()); ASSERT_TRUE(sequence_arithmetic.front()); } diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d..9899e6c08a1d1af9dea3728b5105ff78286de819 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_concat_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_concat_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -94,9 +97,7 @@ static void sequence_concat_ref(const std::vector& xs, } // namespace TEST(sequence_concat_x86, retrive_op) { - auto sequence_concat = - KernelRegistry::Global().Create( - "sequence_concat"); + auto sequence_concat = KernelRegistry::Global().Create("sequence_concat"); ASSERT_FALSE(sequence_concat.empty()); ASSERT_TRUE(sequence_concat.front()); } diff --git a/lite/kernels/x86/sequence_expand_as_compute_test.cc b/lite/kernels/x86/sequence_expand_as_compute_test.cc index d49fdbb7a6164435abb9eb7189b18376066d55df..6eafb5f1e5275e375b7c61fda3c437b6959b8dd2 100644 --- a/lite/kernels/x86/sequence_expand_as_compute_test.cc +++ b/lite/kernels/x86/sequence_expand_as_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_expand_as_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_expand_as_compute.h" namespace paddle { namespace lite { @@ -27,8 +29,7 @@ namespace x86 { TEST(sequence_expand_as_x86, retrive_op) { auto sequence_expand_as = - KernelRegistry::Global().Create( - "sequence_expand_as"); + KernelRegistry::Global().Create("sequence_expand_as"); ASSERT_FALSE(sequence_expand_as.empty()); ASSERT_TRUE(sequence_expand_as.front()); } diff --git a/lite/kernels/x86/sequence_pool_compute_test.cc b/lite/kernels/x86/sequence_pool_compute_test.cc index 372bfaf8741cdcdc902efb6b8380eb4c34dd49ad..35116adbf6f06b87482cfff99182ee6c675ba7ed 100644 --- a/lite/kernels/x86/sequence_pool_compute_test.cc +++ b/lite/kernels/x86/sequence_pool_compute_test.cc @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_pool_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_pool_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(sequence_pool_x86, retrive_op) { - auto sequence_pool = - KernelRegistry::Global().Create( - "sequence_pool"); + auto sequence_pool = KernelRegistry::Global().Create("sequence_pool"); ASSERT_FALSE(sequence_pool.empty()); ASSERT_TRUE(sequence_pool.front()); } diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc index adf9981b242bfbb7f60989369715354cc2043685..37c2f9571d486a36eccc1f01c06a1550d4609730 100644 --- a/lite/kernels/x86/sequence_reverse_compute_test.cc +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_reverse_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_reverse_compute.h" namespace paddle { namespace lite { @@ -44,9 +46,7 @@ static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { } // namespace TEST(sequence_reverse_x86, retrive_op) { - auto sequence_reverse = - KernelRegistry::Global().Create( - "sequence_reverse"); + auto sequence_reverse = KernelRegistry::Global().Create("sequence_reverse"); ASSERT_FALSE(sequence_reverse.empty()); ASSERT_TRUE(sequence_reverse.front()); } diff --git a/lite/kernels/x86/sgd_compute.cc b/lite/kernels/x86/sgd_compute.cc index a3241468f9f09d66401aa83e0d738779e555dfba..dd056e30209953c1f360d714db50e3236f278510 100644 --- a/lite/kernels/x86/sgd_compute.cc +++ b/lite/kernels/x86/sgd_compute.cc @@ -41,8 +41,8 @@ class SGDCompute : public KernelLite { auto *param_out = &sgd_param.ParamOut->raw_tensor(); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + CHECK_EQ(param->numel(), sz); + CHECK_EQ(grad->numel(), sz); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->template data(); diff --git a/lite/kernels/x86/shape_compute_test.cc b/lite/kernels/x86/shape_compute_test.cc index 88bd98f33ffc7a727de584543bc7392cdbb2883f..9fe5e6c51eaee783072717cea055b00b75c59c07 100644 --- a/lite/kernels/x86/shape_compute_test.cc +++ b/lite/kernels/x86/shape_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/shape_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/shape_compute.h" namespace paddle { namespace lite { @@ -23,8 +25,7 @@ namespace kernels { namespace x86 { TEST(shape_x86, retrive_op) { - auto shape = - KernelRegistry::Global().Create("shape"); + auto shape = KernelRegistry::Global().Create("shape"); ASSERT_FALSE(shape.empty()); ASSERT_TRUE(shape.front()); } diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index ad30215691cde66ab1c7c8c57930fc6d58de7cd5..d32327668bac389e42ff9411be50ce3df42e39ff 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -157,7 +157,7 @@ void slice_compute(const lite::Tensor* in, } } - out->mutable_data(lite::TargetType::kX86); + out->mutable_data(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); diff --git a/lite/kernels/x86/slice_compute_test.cc b/lite/kernels/x86/slice_compute_test.cc index a62a62cd88ce48c4d47d784ecbc2fd16d0f433d1..b978d4533ccb28ae8826b8304d93f9bdbe85d106 100644 --- a/lite/kernels/x86/slice_compute_test.cc +++ b/lite/kernels/x86/slice_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
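slice_compute.h above switches out->mutable_data(lite::TargetType::kX86) to the default-target mutable_data() and then slices the Eigen tensor view with offset/extent arrays. As a reference for that slicing step only, here is a self-contained Eigen sketch of the offsets/extents pattern, independent of the Lite tensor types; the shapes and values are made up.

```cpp
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A 4x5 input tensor filled with 0..19.
  Eigen::Tensor<float, 2> in(4, 5);
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 5; ++j) in(i, j) = static_cast<float>(i * 5 + j);

  // Keep rows [1, 3) and columns [2, 5): offsets are the start indices,
  // extents are how many elements are kept along each axis.
  Eigen::array<Eigen::Index, 2> offsets;
  offsets[0] = 1;
  offsets[1] = 2;
  Eigen::array<Eigen::Index, 2> extents;
  extents[0] = 2;
  extents[1] = 3;
  Eigen::Tensor<float, 2> out = in.slice(offsets, extents);

  std::cout << out(0, 0) << " " << out(1, 2) << "\n";  // prints 7 and 14
  return 0;
}
```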
-#include "lite/kernels/x86/slice_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/slice_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -79,8 +82,7 @@ static void slice_ref(const float* input, } TEST(slice_x86, retrive_op) { - auto slice = - KernelRegistry::Global().Create("slice"); + auto slice = KernelRegistry::Global().Create("slice"); ASSERT_FALSE(slice.empty()); ASSERT_TRUE(slice.front()); } diff --git a/lite/kernels/x86/softmax_compute_test.cc b/lite/kernels/x86/softmax_compute_test.cc index 0debeecb3150dfdd2626b6f8f3f6b5ef63981d93..f3def92992c7ca01e75d12b86b2680768a9fd2ee 100644 --- a/lite/kernels/x86/softmax_compute_test.cc +++ b/lite/kernels/x86/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/softmax_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(softmax_x86, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc index d105165a98f936b7a6973e57f5199977a0b8bed3..33942fca96508d2868520e5b5e242b83a1f38b0e 100644 --- a/lite/kernels/x86/stack_compute_test.cc +++ b/lite/kernels/x86/stack_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/stack_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/stack_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,8 +28,7 @@ namespace x86 { // stack TEST(stack_x86, retrive_op) { - auto stack = - KernelRegistry::Global().Create("stack"); + auto stack = KernelRegistry::Global().Create("stack"); ASSERT_FALSE(stack.empty()); ASSERT_TRUE(stack.front()); } diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc index 8132505fad6d93997c73ffb735a4a798c15d87a6..6cba531fd34df029a1cdaaf9d6925e379796260d 100644 --- a/lite/kernels/x86/tanh_compute_test.cc +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(tanh_x86, retrive_op) { - auto tanh = - KernelRegistry::Global().Create("tanh"); + auto tanh = KernelRegistry::Global().Create("tanh"); ASSERT_FALSE(tanh.empty()); ASSERT_TRUE(tanh.front()); } diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h index 5f6faed2017b6bdef60e7505bf1f0088d86b3ec1..87e7fee7deec711914bd43039301f7180a4bcaa0 100644 --- a/lite/kernels/x86/transpose_compute.h +++ b/lite/kernels/x86/transpose_compute.h @@ -60,7 +60,7 @@ inline void TransCompute(const int dim, trans6(context, in, out, axis); break; default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); + LOG(FATAL) << "Tensors with rank at most 6 are supported"; } } diff --git a/lite/kernels/x86/transpose_compute_test.cc b/lite/kernels/x86/transpose_compute_test.cc index d8533d98258637eba516974e03cd4d88fd452293..aa99db36c450326765d602aaf0b48f72a1a63e13 100644 --- a/lite/kernels/x86/transpose_compute_test.cc +++ b/lite/kernels/x86/transpose_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/transpose_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/transpose_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,9 +28,7 @@ namespace x86 { // transpose TEST(transpose_x86, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -75,9 +76,7 @@ TEST(transpose_x86, run_test) { // transpose2 TEST(transpose2_x86, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc index edef8cb2df75dfb45ad4964975365d4ddbbe9086..a6787b2e3e84360a63618f130305446316a08e01 100644 --- a/lite/kernels/x86/var_conv_2d_compute_test.cc +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
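transpose_compute.h now reports an unsupported tensor rank with LOG(FATAL) instead of PADDLE_THROW. A stripped-down sketch of that dispatch shape follows; the real kernel calls Eigen-based trans1..trans6 helpers, which are replaced here by a stand-in callback so the example stays self-contained, and LOG is assumed to come from lite/utils/cp_logging.h as in the rest of the tree.

```cpp
#include <functional>

#include "lite/utils/cp_logging.h"  // LOG(FATAL)

// Dispatch on tensor rank the way TransCompute() does, aborting with a fatal
// log message for ranks the kernel does not implement.
void DispatchByRank(int rank, const std::function<void(int)>& do_transpose) {
  switch (rank) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 5:
    case 6:
      do_transpose(rank);
      break;
    default:
      LOG(FATAL) << "Tensors with rank at most 6 are supported";
  }
}

int main() {
  DispatchByRank(3, [](int) { /* run the rank-3 transpose here */ });
  return 0;
}
```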
-#include "lite/kernels/x86/var_conv_2d_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/x86/var_conv_2d_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -197,9 +200,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, } TEST(var_conv_2d_x86, retrive_op) { - auto var_conv_2d = - KernelRegistry::Global().Create( - "var_conv_2d"); + auto var_conv_2d = KernelRegistry::Global().Create("var_conv_2d"); ASSERT_FALSE(var_conv_2d.empty()); ASSERT_TRUE(var_conv_2d.front()); } diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 7ded008387b7d7c92fb2ce6b18e73e1c1e51f29d..fdb485df02f366f7f4868965b1f20c6861b03d43 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -6,6 +6,7 @@ if(LITE_WITH_XTCL) add_subdirectory(bridges) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) else() + # basic add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu) add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) @@ -15,15 +16,32 @@ else() add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps}) + + # extra + add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(layer_norm_compute_xpu XPU extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_grnn_compute_xpu XPU extra SRCS 
search_grnn_compute.cc DEPS ${lite_kernel_deps}) + + # extra(fused kernel) add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__resnet_cbam_compute_xpu XPU extra SRCS __xpu__resnet_cbam_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__embedding_with_eltwise_add_compute_xpu XPU extra SRCS __xpu__embedding_with_eltwise_add_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps}) endif() diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc index 376cdd0dc23426ede42ddac60e061727f73322e3..224bfdc130338bc653091400708bc8a7421a9482 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc @@ -31,11 +31,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() { CHECK_EQ(table_dims.size(), 2); /* shape like [table_len, embed_dim] */ table_lens_cpu_.push_back(table_dims[0]); } - void* lens_ptr = nullptr; + size_t lens_size = table_lens_cpu_.size() * sizeof(int); - xpu_malloc(&lens_ptr, lens_size); - xpu_memcpy(lens_ptr, &table_lens_cpu_[0], lens_size, XPU_HOST_TO_DEVICE); - table_lens_guard_.reset(lens_ptr); + table_lens_guard_ = + TargetWrapperXPU::MallocScratchPad(lens_size, false /* use_l3 */); + XPU_CALL(xpu_memcpy(table_lens_guard_->addr_, + &table_lens_cpu_[0], + lens_size, + XPU_HOST_TO_DEVICE)); } void XPUEmbeddingWithEltwiseAddCompute::Run() { @@ -55,16 +58,16 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() { int embed_dim = table_dims[1]; int emb_layer_num = param.Ids.size(); int r = xdnn::embedding_with_ewadd( - ctx.GetRawContext(), /* context */ - embed_dim, /* embed_dim */ - idx_len, /* idx_len */ - emb_layer_num, /* emb_layer_num */ - param.padding_idx, /* padding_idx */ - &arg_tables_[0], /* tables */ - &arg_ids_[0], /* indices */ - static_cast(table_lens_guard_.get()), /* table_lens */ - nullptr, /* scale_after_emb */ - nullptr, /* scale_after_ewadd */ + ctx.GetRawContext(), /* context */ + embed_dim, /* embed_dim */ + idx_len, /* idx_len */ + emb_layer_num, /* emb_layer_num */ + param.padding_idx, /* padding_idx */ + &arg_tables_[0], /* tables */ + &arg_ids_[0], /* indices */ + static_cast(table_lens_guard_->addr_), /* table_lens */ + nullptr, /* scale_after_emb */ + nullptr, /* scale_after_ewadd */ param.Out->mutable_data(TARGET(kXPU)) /* top */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h index 10ba6e0b5b76a1dbebfd633732f7c36e6ac7c954..124ed7866f0a52b892e30ae41398d5140064c964 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -36,7 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute 
private: std::vector arg_ids_; std::vector arg_tables_; - std::unique_ptr table_lens_guard_; + XPUScratchPadGuard table_lens_guard_; std::vector table_lens_cpu_; }; diff --git a/lite/kernels/xpu/__xpu__mmdnn_compute.cc b/lite/kernels/xpu/__xpu__mmdnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..09d59fcee37c634a87636ac80e7be15d927f2509 --- /dev/null +++ b/lite/kernels/xpu/__xpu__mmdnn_compute.cc @@ -0,0 +1,1514 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace { + +void FillMax(float max, float* xpu_ptr) { + float maxs[4] = {max, 0.0f, 0.0f, 0.0f}; + XPU_CALL(xpu_memcpy( + xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void GrnnLayout(int batch, + const std::vector& offset, + std::vector* new_offset_ptr, + std::vector* idx_sorted_ptr) { + auto& new_offset = *new_offset_ptr; + auto& idx_sorted = *idx_sorted_ptr; + + std::vector width; + width.resize(batch); + new_offset.clear(); + idx_sorted.clear(); + + idx_sorted.resize(batch); + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted[i] = i; + } + std::sort(idx_sorted.data(), + idx_sorted.data() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted[0]]; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted[k]] > last_width) { + sub_row = width[idx_sorted[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted[k]]; + j = k - 1; + break; + } + } + } +} + +} // anonymous namespace + +class MMDNNIdInfo { + XPUScratchPadGuard l3_buffer_guard_; + char* l3_buffer_{nullptr}; + std::unique_ptr cpu_buffer_guard_; + char* cpu_buffer_{nullptr}; + + public: + const int64_t* id0_64{nullptr}; + const int64_t* id1_64{nullptr}; + int64_t* lod_64{nullptr}; + int* lod_32{nullptr}; + int* new_offset_32{nullptr}; + int* idx_sorted_32{nullptr}; + + std::vector lod; + std::vector new_offset; + std::vector idx_sorted; + int batch; + int seqlen_max; + int seqlen_sum; + int seqlen_square_sum; + + void Init(int upper_bound_batch, int upper_bound_seqlen) { + int ub_lod_64_size = (upper_bound_batch + 1) * sizeof(int64_t); + int ub_lod_32_size = (upper_bound_batch + 1) * sizeof(int); + int ub_new_offset_32_size = (upper_bound_seqlen + 1) * sizeof(int); + int ub_idx_sorted_32_size = (upper_bound_batch + 1) * sizeof(int); + int total_size = ub_lod_64_size + ub_lod_32_size + ub_new_offset_32_size + + ub_idx_sorted_32_size; + + // 
TODO(miaotianxiang): use l3? + l3_buffer_guard_ = TargetWrapperXPU::MallocScratchPad(total_size, false); + l3_buffer_ = reinterpret_cast(l3_buffer_guard_->addr_); + cpu_buffer_guard_.reset(new char[total_size]); + cpu_buffer_ = cpu_buffer_guard_.get(); + } + + void Update(lite::Tensor* id0, lite::Tensor* id1) { + auto& id0_lod = id0->lod()[0]; + lod.clear(); + for (auto e : id0_lod) { + lod.push_back(e); + } + + seqlen_max = 0; + seqlen_sum = 0; + seqlen_square_sum = 0; + batch = lod.size() - 1; + for (int i = 0; i < batch; i++) { + int seqlen = lod[i + 1] - lod[i]; + seqlen_max = std::max(seqlen_max, seqlen); + seqlen_sum = seqlen_sum + seqlen; + seqlen_square_sum = seqlen_square_sum + seqlen * seqlen; + } + GrnnLayout(batch, lod, &new_offset, &idx_sorted); + + id0_64 = id0->data(); + id1_64 = id1->data(); + + int offset = 0; + lod_64 = reinterpret_cast(l3_buffer_ + offset); + memcpy( + cpu_buffer_ + offset, id0_lod.data(), id0_lod.size() * sizeof(int64_t)); + offset += id0_lod.size() * sizeof(int64_t); + lod_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, lod.data(), lod.size() * sizeof(int)); + offset += lod.size() * sizeof(int); + new_offset_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + new_offset.data(), + new_offset.size() * sizeof(int)); + offset += new_offset.size() * sizeof(int); + idx_sorted_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + idx_sorted.data(), + idx_sorted.size() * sizeof(int)); + offset += idx_sorted.size() * sizeof(int); + XPU_CALL(xpu_memcpy( + l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + } +}; + +class MMDNNFcOp { + const int16_t* weight_{nullptr}; + XPUScratchPadGuard weight_max_guard_; + float* weight_max_{nullptr}; + const float* bias_{nullptr}; + XPUScratchPadGuard in_max_guard_; + float* in_max_{nullptr}; + int n_; + int k_; + xdnn::Activation_t::act_enum act_type_; + XPUScratchPadGuard out_max_guard_; + + public: + float* out_max{nullptr}; + + void Init(const int16_t* weight, + float weight_max, + const float* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + n_ = n; + k_ = k; + act_type_ = act_type; + + weight_ = weight; + weight_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + weight_max_ = reinterpret_cast(weight_max_guard_->addr_); + FillMax(weight_max, weight_max_); + + bias_ = bias; + + in_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + out_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + in_max_ = reinterpret_cast(in_max_guard_->addr_); + out_max = reinterpret_cast(in_max_guard_->addr_); + } + + void Init(lite::Tensor* weight, + float weight_max, + lite::Tensor* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + Init(weight->data(), + weight_max, + bias ? 
bias->data() : nullptr, + n, + k, + act_type); + } + + void Infer(xdnn::Context* ctx, + const float* in, + int m, + float* out, + const float* in_max_by_caller = nullptr) { + int r = 0; + if (in_max_by_caller == nullptr) { + r = xdnn::findmax(ctx, in, m * k_, in_max_); + CHECK_EQ(r, 0); + in_max_by_caller = in_max_; + } + r = xdnn::gemm_int16_maxptr(ctx, + false, + true, + m, + n_, + k_, + 1.0f, + in, + k_, + weight_, + k_, + 0.0f, + out, + n_, + bias_, + act_type_, + in_max_by_caller, + weight_max_, + out_max); + CHECK_EQ(r, 0); + } +}; + +class MMDNNGrnnOp { + MMDNNFcOp fc_e2h0_; + MMDNNFcOp fc_e2h1_; + MMDNNFcOp fc_e2h2_; + const int16_t* dense_h2h_{nullptr}; + float dense_h2h_max_[3]; + XPUScratchPadGuard input_max_guard_; + float* input_max_{nullptr}; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * max(cap_e_, cap_h_) * 5 + // seq2batch_out: [cap_l, cap_e_] + // fc_e2h_out: [3, cap_l, cap_h_] + // gru_out: [cap_l, cap_h_] + int cap_e_; + int cap_h_; + int max_cap_l_; + + public: + void Init(lite::Tensor* wh, + const std::vector& wh_maxs, + lite::Tensor* wi, + const std::vector& wi_maxs, + int cap_e, + int cap_h, + int max_cap_l) { + cap_e_ = cap_e; + cap_h_ = cap_h; + max_cap_l_ = max_cap_l; + + // weight + auto* dense_e2h = wi->data(); + fc_e2h0_.Init(dense_e2h, + wi_maxs[0], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h1_.Init(dense_e2h + cap_e_ * cap_h_, + wi_maxs[1], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h2_.Init(dense_e2h + cap_e_ * cap_h_ * 2, + wi_maxs[2], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + + dense_h2h_ = wh->data(); + dense_h2h_max_[0] = wh_maxs[0]; + dense_h2h_max_[1] = wh_maxs[1]; + dense_h2h_max_[2] = wh_maxs[2]; + + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + input_max_ = reinterpret_cast(input_max_guard_->addr_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * std::max(cap_e_, cap_h_) * max_cap_l_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* in, + float* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + + int slot_size = cap_l * std::max(cap_e_, cap_h_); + float* seq2batch_out = hbm_buffer_; + float* fc_e2h_out = hbm_buffer_ + 1 * slot_size; + float* gru_out = hbm_buffer_ + 4 * slot_size; + if (l3_size > 0 && l3_size >= 5 * slot_size * sizeof(float)) { + seq2batch_out = l3_buffer; + fc_e2h_out = l3_buffer + 1 * slot_size; + gru_out = l3_buffer + 4 * slot_size; + } + + int r = 0; + r = xdnn::search_seq2batch(ctx, + batch, + max_width, + cap_e_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + in, + seq2batch_out); + CHECK_EQ(r, 0); + + r = xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + CHECK_EQ(r, 0); + fc_e2h0_.Infer(ctx, seq2batch_out, cap_l, fc_e2h_out, input_max_); + fc_e2h1_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_, input_max_); + fc_e2h2_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_ * 2, input_max_); + r = xdnn::search_grnn(ctx, + cap_l, + cap_h_, + cap_e_, + max_width, + sentense.new_offset_32, + fc_e2h_out, + dense_h2h_, + gru_out, + dense_h2h_max_[0], + dense_h2h_max_[1], + dense_h2h_max_[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx, + batch, + max_width, + 
cap_h_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + gru_out, + out); + CHECK_EQ(r, 0); + } +}; + +class MMDNNAttentionOp { + int dim_; + float alpha0_; + float alpha1_; + MMDNNFcOp seqfc_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * dim_ + seqlen_square_sum + // seqfc_out: [cap_l, dim_] + // batchgemm0_out: [seqlen_square_sum] + // seq_softmax_out: [seqlen_square_sum], reuse of batchgemm0_out + // batchgemm1_out: [cap_l, dim_], reuse of seqfc_out + + public: + void Init(lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int dim, + int upper_bound_batch, + int upper_bound_seqlen) { + dim_ = dim; + alpha0_ = 0.0883883461356163f; // TODO(miaotianxiang): + alpha1_ = 1.0f; + + seqfc_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + dim_, + dim_, + xdnn::Activation_t::LINEAR); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * (upper_bound_seqlen * dim_ + + upper_bound_seqlen * upper_bound_seqlen)) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* input, + float* pool_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + int* lod_32 = sentense.lod_32; + + float* seqfc_out = hbm_buffer_; + float* batchgemm0_out = hbm_buffer_ + cap_l * dim_; + float* seq_softmax_out = batchgemm0_out; + float* batchgemm1_out = seqfc_out; + if (l3_size > 0 && + l3_size >= + (cap_l * dim_ + sentense.seqlen_square_sum) * sizeof(float)) { + seqfc_out = l3_buffer; + batchgemm0_out = l3_buffer + cap_l * dim_; + seq_softmax_out = batchgemm0_out; + batchgemm1_out = seqfc_out; + } + + seqfc_.Infer(ctx, input, cap_l, seqfc_out); + int r = 0; + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 1, + batch, + lod_32, + max_width, + dim_, + alpha0_, + input, + seqfc_out, + batchgemm0_out); + CHECK_EQ(r, 0); + r = xdnn::search_seq_softmax( + ctx, batchgemm0_out, seq_softmax_out, lod_32, batch, max_width); + CHECK_EQ(r, 0); + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 0, + batch, + lod_32, + max_width, + dim_, + alpha1_, + seq_softmax_out, + input, + batchgemm1_out); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::MAX_WITHOUT_INDEX, + batch, + lod_32, + dim_, + batchgemm1_out, + nullptr, + pool_out); + CHECK_EQ(r, 0); + } +}; + +class MMDNNMatchConvTopk { + std::vector topks_; + int dim_t_; + int dim_in_; + int out_channel_; + + MMDNNFcOp xw_fc_; + const int16_t* conv_weight_{nullptr}; + float conv_weight_max_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // xw_out: [sum(left_len), dim_t_ * dim_in_] + // xwy_out: [sum(left_len * right_len) * dim_t_] + // conv_out: [sum(left_len * right_len) * out_channel_] + // seq_concat_out: [sum(left_len * right_len) * (dim_t_ + out_channel_)] + + XPUScratchPadGuard left_lod_32_guard_; + int* left_lod_32_{nullptr}; + XPUScratchPadGuard right_lod_32_guard_; + int* right_lod_32_{nullptr}; + XPUScratchPadGuard match_lod_32_guard_; + int* match_lod_32_{nullptr}; + XPUScratchPadGuard conv_lod_32_guard_; + int* conv_lod_32_{nullptr}; + XPUScratchPadGuard topk_offset_32_guard_; + int* topk_offset_32_{nullptr}; + XPUScratchPadGuard topks_xpu_guard_; + int* topks_xpu_{nullptr}; + XPUScratchPadGuard useless_topk_pos_guard_; + int* useless_topk_pos_{nullptr}; + + public: + float* 
seq_avg_topk_out{nullptr}; + + void Init(lite::Tensor* input_w, + float input_w_max, + lite::Tensor* conv_w, + float conv_w_max, + int dim_t, + int dim_in, + int out_channel, + int upper_bound_batch, + int upper_bound_seqlen, + const std::vector& topks) { + dim_t_ = dim_t; + dim_in_ = dim_in; + out_channel_ = out_channel; + topks_ = topks; + + xw_fc_.Init(input_w, + input_w_max, + nullptr, + dim_t_ * dim_in_, + dim_in_, + xdnn::Activation_t::LINEAR); + conv_weight_ = conv_w->data(); + conv_weight_max_ = conv_w_max; + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * upper_bound_seqlen * dim_t_ * dim_in_ + + upper_bound_batch * upper_bound_seqlen * upper_bound_seqlen * + (dim_t_ + out_channel_) * 2) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + + left_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + left_lod_32_ = reinterpret_cast(left_lod_32_guard_->addr_); + right_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + right_lod_32_ = reinterpret_cast(right_lod_32_guard_->addr_); + match_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + match_lod_32_ = reinterpret_cast(match_lod_32_guard_->addr_); + conv_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + conv_lod_32_ = reinterpret_cast(conv_lod_32_guard_->addr_); + topk_offset_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + topk_offset_32_ = reinterpret_cast(topk_offset_32_guard_->addr_); + topks_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(topks_.size() * sizeof(int), false); + topks_xpu_ = reinterpret_cast(topks_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(topks_xpu_, + topks_.data(), + topks_.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + useless_topk_pos_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(int), false); + useless_topk_pos_ = reinterpret_cast(useless_topk_pos_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + lite::Tensor* left, + lite::Tensor* right, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + auto left_lod = left->lod()[0]; + auto right_lod = right->lod()[0]; + int batch = left_lod.size() - 1; + + std::vector left_lod_32_cpu; + for (auto e : left_lod) { + left_lod_32_cpu.push_back(e); + } + XPU_CALL(xpu_memcpy(left_lod_32_, + left_lod_32_cpu.data(), + left_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + std::vector right_lod_32_cpu; + for (auto e : right_lod) { + right_lod_32_cpu.push_back(e); + } + XPU_CALL(xpu_memcpy(right_lod_32_, + right_lod_32_cpu.data(), + right_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + std::vector lod_match = {0}; + std::vector lod_conv = {0}; + std::vector lod_topk = {0}; + int x_mul_y_sum = 0; + int left_seqlen_sum = 0; + int left_seqlen_max = 0; + int right_seqlen_sum = 0; + int right_seqlen_max = 0; + for (int i = 0; i < batch; i++) { + int len_x = left_lod[i + 1] - left_lod[i]; + int len_y = right_lod[i + 1] - right_lod[i]; + int imgsize = len_x * len_y; + x_mul_y_sum = x_mul_y_sum + imgsize; + lod_match.push_back(lod_match.back() + imgsize * dim_t_); + lod_conv.push_back(lod_conv.back() + imgsize * out_channel_); + lod_topk.push_back(lod_topk.back() + imgsize * (dim_t_ + out_channel_)); + + left_seqlen_max = std::max(left_seqlen_max, len_x); + 
right_seqlen_max = std::max(right_seqlen_max, len_y); + left_seqlen_sum += len_x; + right_seqlen_sum += len_y; + } + XPU_CALL(xpu_memcpy(match_lod_32_, + lod_match.data(), + lod_match.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(conv_lod_32_, + lod_conv.data(), + lod_conv.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topk_offset_32_, + lod_topk.data(), + lod_topk.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + float* xwy_out = hbm_buffer_; + float* conv_out = hbm_buffer_ + x_mul_y_sum * dim_t_; + float* seq_concat_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_); + float* xw_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + int total_len = x_mul_y_sum * (dim_t_ + out_channel_) * 2 + + left_seqlen_sum * dim_t_ * dim_in_; + if (l3_size > 0 && l3_size >= total_len * sizeof(float)) { + xwy_out = l3_buffer; + conv_out = l3_buffer + x_mul_y_sum * dim_t_; + seq_concat_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_); + xw_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + } + seq_avg_topk_out = out->mutable_data(TARGET(kXPU)); + + int max_width = std::max(left_seqlen_max, right_seqlen_max); + xw_fc_.Infer(ctx, left->data(), left_seqlen_sum, xw_out); + int r = 0; + r = xdnn::match_matrix_tensor(ctx, + batch, + xw_out, + right->data(), + left_lod_32_, + right_lod_32_, + dim_t_, + dim_in_, + xwy_out, + xw_fc_.out_max, + xdnn::Activation_t::RELU, + max_width); + CHECK_EQ(r, 0); + r = xdnn::search_varconv( + ctx, + batch, + dim_t_, + out_channel_, + 5, + 5, + 1, + 1, + xwy_out, + conv_weight_, + right_lod_32_, + left_lod_32_, + conv_out, + conv_weight_max_, + xdnn::Activation_t::RELU); // TODO(miaotianxiang): + CHECK_EQ(r, 0); + r = xdnn::sequence_concat(ctx, + xwy_out, + match_lod_32_, + conv_out, + conv_lod_32_, + seq_concat_out, + batch); + CHECK_EQ(r, 0); + r = xdnn::sequence_topk_avg_pooling(ctx, + seq_concat_out, + seq_avg_topk_out, + useless_topk_pos_, + batch, + dim_t_ + out_channel_, + topk_offset_32_, + left_lod_32_, + right_lod_32_, + topks_xpu_, + topks_.size()); + CHECK_EQ(r, 0); + } +}; + +class MMDNNBidEmbGrnnAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + int cap_h_; + MMDNNGrnnOp bi_fw_; + MMDNNGrnnOp bi_rv_; + MMDNNAttentionOp att_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require at least: 4 * cap_l * emb_dim_ + // emb_rv: [cap_l, emb_dim_] + // grnn_fw: [cap_l, emb_dim_] + // grnn_rv: [cap_l, emb_dim_] + // grnn_rv_rv: [cap_l, emb_dim_] + // concat_2in: [cap_l, 2 * emb_dim_] + // L3.bi_fw: 5 * cap_l * emb_dim_ + // L3.bi_rv: 5 * cap_l * emb_dim_ + // L3.att: cap_l * 2 * emb_dim_ + seqlen_square_sum + + // execution-plan: + // 1. bid_emb_ew, alloc(emb_rv) + // 2. bi_rv, alloc(grnn_rv) + // 3. free(emb_rv) + // 4. sequence_reverse, alloc(grnn_rv_rv) + // 5. sequence_pooling(grnn_rv) + // 6. free(grnn_rv) + // 7. bi_fw alloc(grnn_fw) + // 8. sequence_pooling(grnn_fw) + // 9. concat_2 alloc(concat_2in) + // 10. concat_3 + // 11. 
att + + // alloc-plan: + // [0]: emb_rv, grnn_rv_rv + // [1]: grnn_rv, grnn_fw + // [2, 3]: concat_2in + // [2, 3, 4, 5, 6]: L3.bi_fw, L3.bi_rv + // [4, 5, ..., ?]: L3.att + + public: + float* emb_fw{nullptr}; + float* concat_3in{nullptr}; + float* pool_fw{nullptr}; + float* pool_rv{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* fw_wh, + const std::vector& fw_wh_maxs, + lite::Tensor* fw_wi, + const std::vector& fw_wi_maxs, + lite::Tensor* rv_wh, + const std::vector& rv_wh_maxs, + lite::Tensor* rv_wi, + const std::vector& rv_wi_maxs, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + cap_h_ = emb_dim_; + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + + bi_fw_.Init( + fw_wh, fw_wh_maxs, fw_wi, fw_wi_maxs, emb_dim_, cap_h_, max_cap_l); + bi_rv_.Init( + rv_wh, rv_wh_maxs, rv_wi, rv_wi_maxs, emb_dim_, cap_h_, max_cap_l); + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + 2 * cap_h_, + upper_bound_batch, + upper_bound_seqlen); + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * max_cap_l * cap_h_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* grnn_fw_pool_out, + lite::Tensor* grnn_rv_pool_out, + lite::Tensor* att_pool_out, + lite::Tensor* concat_3in1_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int cap_l = sentense.seqlen_sum; + int slot_len = cap_l * cap_h_; + + float* emb_rv = hbm_buffer_; + float* grnn_fw = hbm_buffer_ + slot_len; + float* grnn_rv = hbm_buffer_ + slot_len; + float* grnn_rv_rv = hbm_buffer_; + float* concat_2in = hbm_buffer_ + 2 * slot_len; + if (l3_size > 0 && l3_size >= 4 * slot_len * sizeof(float)) { + emb_rv = l3_buffer; + grnn_fw = l3_buffer + slot_len; + grnn_rv = l3_buffer + slot_len; + grnn_rv_rv = l3_buffer; + } + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + concat_3in = concat_3in1_out->mutable_data(TARGET(kXPU)); + pool_fw = grnn_fw_pool_out->mutable_data(TARGET(kXPU)); + pool_rv = grnn_rv_pool_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int r = 0; + r = xdnn::search_bid_emb_ew(ctx, + batch, + sentense.lod_64, + sentense.id0_64, + sentense.id1_64, + table_, + table_len_, + emb_dim_, + emb_fw, + emb_rv, + table_len_ - 2, + 1); + CHECK_EQ(r, 0); + bi_rv_.Infer(ctx, + sentense, + emb_rv, + grnn_rv, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + r = xdnn::sequence_reverse( + ctx, batch, sentense.lod_32, cap_h_, grnn_rv, grnn_rv_rv); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); + + bi_fw_.Infer(ctx, + sentense, + emb_fw, + grnn_fw, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); + const int concat_widths[] = {cap_h_, cap_h_, cap_h_}; + const float* concat_ptrs[] = {emb_fw, grnn_fw, grnn_rv_rv}; + r = xdnn::concat( + ctx, cap_l, concat_widths + 1, 2, concat_ptrs + 1, concat_2in); + CHECK_EQ(r, 0); + r = xdnn::concat( + ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + 
CHECK_EQ(r, 0); + att_.Infer(ctx, + sentense, + concat_2in, + att_out, + l3_buffer + 4 * slot_len, + l3_size - 4 * slot_len * sizeof(float)); + } +}; + +class MMDNNEmbAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + MMDNNAttentionOp att_; + + public: + float* emb_fw{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + emb_dim_, + upper_bound_batch, + upper_bound_seqlen); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* att_pool_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int cap_l = sentense.lod.back(); + const float* emb_tables[] = {table_, table_}; + const int64_t* emb_indices[] = {sentense.id0_64, sentense.id1_64}; + int r = + xdnn::embedding_with_ewadd(ctx, + emb_dim_, + cap_l, + 2, + table_len_ - 2, + emb_tables, + emb_indices, + nullptr, + nullptr, + emb_fw); + CHECK_EQ(r, 0); + att_.Infer(ctx, sentense, emb_fw, att_out, l3_buffer, l3_size); + } +}; + +class MMDNNMergeAll { + MMDNNGrnnOp coverage_fw_; + MMDNNGrnnOp coverage_rv_; + int cap_e_; + int cap_h_; + + // TODO(miaotianxiang): + const int fc0_k_ = 1152; + const int fc0_n_ = 512; + const int fc1_k_ = 640; + const int fc1_n_ = 320; + const int fc2_k_ = 320; + const int fc2_n_ = 1; + MMDNNFcOp fc0_; + MMDNNFcOp fc1_; + MMDNNFcOp fc2_; + + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // topk_concat_out_fw: [cap_l, cap_e_] <= [cap_l, cap_h_] + // topk_concat_out_rv: [cap_l, cap_e_] <= [cap_l, cap_h_] + // grnn_fw: [cap_l, cap_h_] + // grnn_rv: [cap_l, cap_h_] + // pool_fw: [batch, cap_h_] + // pool_rv: [batch, cap_h_] + // fc0_in: [batch, fc0_k_] + // fc0_out: [batch, fc0_n_] + // fc1_in: [batch, fc1_k_] + // fc1_out: [batch, fc1_n_] + // fc2_out: [batch, fc2_n_] + + public: + void Init(lite::Tensor* grnn_fw_wh, + std::vector grnn_fw_wh_maxs, + lite::Tensor* grnn_fw_wi, + std::vector grnn_fw_wi_maxs, + lite::Tensor* grnn_rv_wh, + std::vector grnn_rv_wh_maxs, + lite::Tensor* grnn_rv_wi, + std::vector grnn_rv_wi_maxs, + lite::Tensor* fc0_w, + float fc0_w_max, + lite::Tensor* fc0_b, + lite::Tensor* fc1_w, + float fc1_w_max, + lite::Tensor* fc1_b, + lite::Tensor* fc2_w, + float fc2_w_max, + lite::Tensor* fc2_b, + int upper_bound_batch, + int upper_bound_seqlen) { + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + cap_e_ = grnn_fw_wi->dims()[2]; + cap_h_ = grnn_fw_wi->dims()[1]; + + coverage_fw_.Init(grnn_fw_wh, + grnn_fw_wh_maxs, + grnn_fw_wi, + grnn_fw_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + coverage_rv_.Init(grnn_rv_wh, + grnn_rv_wh_maxs, + grnn_rv_wi, + grnn_rv_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + + fc0_.Init( + fc0_w, fc0_w_max, fc0_b, fc0_n_, fc0_k_, xdnn::Activation_t::RELU); + fc1_.Init( + fc1_w, fc1_w_max, fc1_b, fc1_n_, fc1_k_, xdnn::Activation_t::RELU); + fc2_.Init( + fc2_w, fc2_w_max, fc2_b, fc2_n_, fc2_k_, xdnn::Activation_t::LINEAR); + + int hbm_total_len = max_cap_l * cap_e_ * 2 + max_cap_l * cap_h_ * 2 + + upper_bound_batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + + fc1_k_ + fc1_n_ + fc2_n_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + hbm_total_len * 
sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const std::vector concat_topk_x, + const std::vector concat_7in1_x, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + + float* topk_concat_out_fw = hbm_buffer_; + int hbm_total_len = + cap_l * cap_e_ * 2 + cap_l * cap_h_ * 2 + + batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); + if (l3_size > 0 && l3_size >= hbm_total_len * sizeof(float)) { + topk_concat_out_fw = l3_buffer; + } + float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_e_; + float* grnn_fw = topk_concat_out_rv + cap_l * cap_e_; + float* grnn_rv = grnn_fw + cap_l * cap_h_; + float* pool_fw = grnn_rv + cap_l * cap_h_; + float* pool_rv = pool_fw + batch * cap_h_; + float* fc0_in = pool_fw + batch * cap_h_ * 2; + float* fc0_out = fc0_in + batch * fc0_k_; + float* fc1_in = fc0_out + batch * fc0_n_; + float* fc1_out = fc1_in + batch * fc1_k_; + // float* fc2_out = fc1_out + batch * fc1_n_; + float* fc2_out = out->mutable_data(TARGET(kXPU)); + + std::vector concat_widths; + std::vector concat_ptrs; + for (const auto* t : concat_topk_x) { + concat_widths.push_back(static_cast(t->dims()[1])); + concat_ptrs.push_back(t->data()); + } + int r = 0; + r = xdnn::concat(ctx, + cap_l, + concat_widths.data(), + concat_widths.size(), + concat_ptrs.data(), + topk_concat_out_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_reverse(ctx, + batch, + sentense.lod_32, + cap_e_, + topk_concat_out_fw, + topk_concat_out_rv); + CHECK_EQ(r, 0); + coverage_fw_.Infer(ctx, + sentense, + topk_concat_out_fw, + grnn_fw, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + coverage_rv_.Infer(ctx, + sentense, + topk_concat_out_rv, + grnn_rv, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); + + const int concat_widths_fc0[] = { + static_cast(concat_7in1_x[0]->dims()[1]), + static_cast(concat_7in1_x[1]->dims()[1]), + static_cast(concat_7in1_x[2]->dims()[1]), + static_cast(concat_7in1_x[3]->dims()[1]), + static_cast(concat_7in1_x[4]->dims()[1]), + static_cast(concat_7in1_x[5]->dims()[1]), + static_cast(concat_7in1_x[6]->dims()[1]), + }; + const float* concat_ptrs_fc0[] = { + concat_7in1_x[0]->data(), + concat_7in1_x[1]->data(), + concat_7in1_x[2]->data(), + concat_7in1_x[3]->data(), + concat_7in1_x[4]->data(), + concat_7in1_x[5]->data(), + concat_7in1_x[6]->data(), + }; + const int concat_widths_fc1[] = {cap_h_, cap_h_, fc0_n_}; + const float* concat_ptrs_fc1[] = {pool_fw, pool_rv, fc0_out}; + + r = xdnn::concat( + ctx, batch, concat_widths_fc0, 7, concat_ptrs_fc0, fc0_in); + CHECK_EQ(r, 0); + fc0_.Infer(ctx, fc0_in, batch, fc0_out); + r = xdnn::concat( + ctx, batch, concat_widths_fc1, 3, concat_ptrs_fc1, fc1_in); + CHECK_EQ(r, 0); + fc1_.Infer(ctx, fc1_in, batch, fc1_out); + fc2_.Infer(ctx, fc1_out, batch, fc2_out); + } +}; + +class XPUMmdnnBidEmbGrnnAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + 
MMDNNBidEmbGrnnAtt compound_; +}; + +void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbGrnnAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnBidEmbGrnnAttCompute2 + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam2; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNBidEmbGrnnAtt compound_; +}; + +void XPUMmdnnBidEmbGrnnAttCompute2::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbGrnnAttCompute2::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); + + int num = param.id0->numel(); + int embed_dim = param.emb_tbl->dims()[1]; + + // TODO(miaotianxiang): + int r = xdnn::embedding( + ctx.GetRawContext(), /* context */ + num, /* num */ + param.id0->data(), /* indices */ + embed_dim, /* embed_dim */ + param.emb_tbl->data(), /* table */ + param.emb0_out->mutable_data(TARGET(kXPU)), /* top */ + 128000 /* padding_idx */); + CHECK_EQ(r, 0); +} + +class XPUMmdnnBidEmbAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNEmbAtt compound_; +}; + +void XPUMmdnnBidEmbAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.att_pool_out, + 
param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMatchConvTopkCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMatchConvTopkParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNMatchConvTopk compound_; +}; + +void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { + auto& param = this->Param(); + + compound_.Init(param.input_w, + param.input_w_max, + param.conv_w, + param.conv_w_max, + param.dim_t, + param.input_w->dims()[0], + param.output_channel, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN, + param.topks); +} + +void XPUMmdnnMatchConvTopkCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + compound_.Infer(ctx.GetRawContext(), + param.input_x, + param.input_y, + param.topk_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMergeAllCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMergeAllParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNMergeAll compound_; +}; + +void XPUMmdnnMergeAllCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.fc0_w, + param.fc0_w_max, + param.fc0_b, + param.fc1_w, + param.fc1_w_max, + param.fc1_b, + param.fc2_w, + param.fc2_w_max, + param.fc2_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnMergeAllCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + id_.Update(param.concat_topk_x[0], param.concat_topk_x[1]); + compound_.Infer(ctx.GetRawContext(), + id_, + param.concat_topk_x, + param.concat_7in1_x, + param.out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + 
.Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute2, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb0_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_match_conv_topk, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMatchConvTopkCompute, + def) + .BindInput("input_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("conv_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("topk_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_merge_all, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMergeAllCompute, + def) + .BindInput("concat_7in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("concat_topk_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h index 
71db4e6f44f9c36e4acdaf0a440463a61f4e3099..dbc2d785d42ad29dc1cfbe36f744b71662e48315 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.h b/lite/kernels/xpu/__xpu__resnet50_compute.h index 3d42f8b6f26edf615dba165b553b633673a4ae66..7ce8b1192ea9e85d83ddbeddc374378692866aa6 100644 --- a/lite/kernels/xpu/__xpu__resnet50_compute.h +++ b/lite/kernels/xpu/__xpu__resnet50_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d57445cd44953f504e292ad38d44d047daa3a7a --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__resnet_cbam_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUResNetCbamCompute::PrepareForRun() { + auto& param = this->Param(); + + for (auto* filter : param.filter) { + arg_filter_.push_back( + reinterpret_cast(filter->data())); + } + for (auto* bias : param.bias) { + if (bias == nullptr) { + arg_bias_.push_back(nullptr); + } else { + arg_bias_.push_back(bias->data()); + } + } + for (auto* max_filter : param.max_filter) { + arg_max_filter_.push_back(max_filter->data()); + } +} + +void XPUResNetCbamCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto input_dims = param.input->dims(); + int batch_size = input_dims[0]; + int height = input_dims[2]; + int width = input_dims[3]; + + int r = xdnn::conv2d_int16_resnet_cbam( + ctx.GetRawContext(), /* context */ + batch_size, /* num */ + height, /* height */ + width, /* width */ + param.input->data(), /* bottom */ + &arg_filter_[0], /* weight_list */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + &arg_bias_[0], /* bias_list */ + &arg_max_filter_[0], /* max_filter_list */ + param.pool_p, /* pool_p */ + true, /* midtype_fp16 */ + false /* dynamic_shape */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__resnet_cbam, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUResNetCbamCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); 
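Throughout the kernels above, every int16 weight blob travels with a float maximum (weight_max, the wi_maxs/wh_maxs vectors, the maxs_xpu staging buffer), and activations get theirs from xdnn::findmax right before the int16 GEMM runs. That pairing is the usual symmetric-quantization setup: the quantized values only make sense together with the original dynamic range. The sketch below shows the host-side idea in plain C++; QuantizeInt16 and Dequantize are illustrative helpers, not xdnn APIs.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Symmetric int16 quantization: one scale per tensor, derived from max|x|.
    // This mirrors the "weights + weight_max" pairs handed to the XPU kernels,
    // but runs on the host and is only an illustration of the convention.
    struct QuantizedTensor {
      std::vector<int16_t> data;
      float max_abs;  // the "max" value kept next to the quantized weights
    };

    QuantizedTensor QuantizeInt16(const std::vector<float>& x) {
      float max_abs = 0.0f;
      for (float v : x) max_abs = std::max(max_abs, std::fabs(v));
      QuantizedTensor q;
      q.max_abs = max_abs;
      q.data.reserve(x.size());
      const float scale = max_abs > 0.0f ? 32767.0f / max_abs : 0.0f;
      for (float v : x) {
        q.data.push_back(static_cast<int16_t>(std::round(v * scale)));
      }
      return q;
    }

    float Dequantize(int16_t v, float max_abs) {
      return max_abs > 0.0f ? v * max_abs / 32767.0f : 0.0f;
    }

For the activations the same quantity is what xdnn::findmax produces on the device; the gemm_int16_maxptr calls above then take the input, weight, and output maxima through their max_a/max_b/max_c pointer arguments.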
diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.h b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b952bb088ea88399966c170cbeadebfa698889d8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUResNetCbamCompute + : public KernelLite { + public: + using param_t = operators::XPUResNetCbamParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + std::vector arg_filter_; + std::vector arg_max_filter_; + std::vector arg_bias_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.cc b/lite/kernels/xpu/__xpu__search_attention_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f02f566dfb01f2d8a57302e714f4f2cb3d4b786 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/__xpu__search_attention_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUMmdnnSearchAttentionCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + w_max_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */); + buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */); + buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */); + + offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + pad_begin_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void XPUMmdnnSearchAttentionCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* X = param.X; + auto* W = param.W; + auto* b = param.b; + float W_max = param.W_max; + float alpha0 = param.alpha0; + float alpha1 = param.alpha1; + float mask = param.mask; + + const int16_t* w_data = W->data(); + const float* b_data = b->data(); + + int batch = X->lod()[0].size() - 1; + int dim0 = X->dims()[0]; + int dim1 = X->dims()[1]; + const auto offset = X->lod()[0]; + int max_seq = 0; + + auto* top = param.Out; + LoD top_lod; + top_lod.push_back(X->lod()[0]); + top->set_lod(top_lod); + top->Resize({dim0, dim1}); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, W_max, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < batch; ++i) { + offset_cpu[i] = offset[i]; // type of offset is int64, not supported by xpu + pad_begin_cpu[i] = offset[i + 1] - offset[i]; + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + offset_cpu[batch] = offset[batch]; + + XPU_CALL(xpu_memcpy(offset_xpu_guard_->addr_, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(pad_begin_xpu_guard_->addr_, + pad_begin_cpu.get(), + batch * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(w_max_xpu_guard_->addr_, + maxs_cpu, + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* pad_begin_xpu = reinterpret_cast(pad_begin_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(w_max_xpu_guard_->addr_); + float* buffer_at_l3 = reinterpret_cast(buffer_at_l3_guard_->addr_); + float* buffer_at_gm = reinterpret_cast(buffer_at_gm_guard_->addr_); + + // when use l3, max_seq <= 128: + // group_padding: batch * max_seq * dim1; at (slot0, slot1) + // seq_fc: batch * max_seq * dim1; at (slot2, slot3) + // batchgemm0: batch * max_seq * max_seq; at slot4 + // attention_padding_mask: batch * max_seq * max_seq; at slot3 + // seq_softmax: batch * max_seq * max_seq; at slot4 + // batchgemm1: batch * max_seq * dim1; at (slot2, slot3) + float* group_padding_output = buffer_at_l3; + float* seq_fc_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + float* batchgemm0_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* attention_output = buffer_at_l3 + 3 * L3_SLOT_SIZE; + float* seq_softmax_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* batchgemm1_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + + if (max_seq > 128) { + group_padding_output = buffer_at_gm; + 
seq_fc_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + batchgemm0_output = buffer_at_gm + 2 * GM_SLOT_SIZE; + attention_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + seq_softmax_output = buffer_at_gm + 3 * GM_SLOT_SIZE; + batchgemm1_output = buffer_at_gm + 4 * GM_SLOT_SIZE; + } + + const auto* bottom_data = X->data(); + int r = 0; + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + const_cast(bottom_data), + group_padding_output, + offset_xpu, + max_seq, + batch, + dim1, + 0); // is_depad = 0 + CHECK_EQ(r, 0); + // do-findmax + r = xdnn::findmax(ctx.GetRawContext(), + group_padding_output, + batch * max_seq * dim1, + maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + true, /* trans_b */ + batch * max_seq, /* m */ + dim1, /* n */ + dim1, /* k */ + 1.0f, /* alpha */ + group_padding_output, /* data_a */ + dim1, /* lda */ + w_data, /* data_b */ + dim1, /* ldb */ + 0.0f, /* beta */ + seq_fc_output, /* data_c */ + dim1, /* ldc */ + b_data, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 1, + batch, + max_seq, + max_seq, + dim1, + alpha0, + group_padding_output, + dim1, + seq_fc_output, + dim1, + batchgemm0_output, + max_seq); + CHECK_EQ(r, 0); + r = xdnn::search_pad_mask(ctx.GetRawContext(), + batchgemm0_output, + attention_output, + pad_begin_xpu, + batch, + max_seq, + max_seq, + batch, + mask); + CHECK_EQ(r, 0); + r = xdnn::softmax2d_forward(ctx.GetRawContext(), + attention_output, + seq_softmax_output, + batch * max_seq, + max_seq, + true); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 0, + batch, + max_seq, + dim1, + max_seq, + alpha1, + seq_softmax_output, + max_seq, + group_padding_output, + dim1, + batchgemm1_output, + dim1); + CHECK_EQ(r, 0); + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + top_data, + batchgemm1_output, + offset_xpu, + max_seq, + batch, + dim1, + 1); // is_depad = 1 + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_search_attention, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnSearchAttentionCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.h b/lite/kernels/xpu/__xpu__search_attention_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f9670dbab6247927acf6ac7d7b47f98a464a3489 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
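The search-attention Run above does all of its matrix work on a dense [batch, max_seq, dim1] layout: search_sequence_pad_depad first scatters the ragged LoD input into that padded buffer (is_depad = 0) and, at the very end, gathers the result back into LoD order (is_depad = 1). Below is a host-side sketch of the two directions; the zero-fill of the padded tail and the helper names are assumptions made for illustration, not xdnn's documented semantics.

    #include <cstring>
    #include <vector>

    // offsets is the level-0 LoD: sequence b occupies rows [offsets[b], offsets[b+1]).
    void PadSequences(const float* in, float* out,  // out: [batch, max_seq, dim]
                      const std::vector<int>& offsets, int max_seq, int dim) {
      int batch = static_cast<int>(offsets.size()) - 1;
      std::memset(out, 0, sizeof(float) * batch * max_seq * dim);  // assumed zero padding
      for (int b = 0; b < batch; ++b) {
        int len = offsets[b + 1] - offsets[b];
        std::memcpy(out + b * max_seq * dim, in + offsets[b] * dim,
                    sizeof(float) * len * dim);
      }
    }

    void DepadSequences(const float* in, float* out,  // in: [batch, max_seq, dim]
                        const std::vector<int>& offsets, int max_seq, int dim) {
      int batch = static_cast<int>(offsets.size()) - 1;
      for (int b = 0; b < batch; ++b) {
        int len = offsets[b + 1] - offsets[b];
        std::memcpy(out + offsets[b] * dim, in + b * max_seq * dim,
                    sizeof(float) * len * dim);
      }
    }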
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUMmdnnSearchAttentionCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnSearchAttentionParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard pad_begin_xpu_guard_; + XPUScratchPadGuard w_max_xpu_guard_; + XPUScratchPadGuard buffer_at_l3_guard_; + XPUScratchPadGuard buffer_at_gm_guard_; + + std::unique_ptr offset_cpu; + std::unique_ptr pad_begin_cpu; + + const int L3_SLOT_SIZE = 40 * 128 * 128; + const int GM_SLOT_SIZE = 40 * 512 * 512; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index e440bde4146a88929c52c20ff1038eb35be91d38..f2ad667886ac33191687b70aa7548050461545e7 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/batch_norm_compute.h b/lite/kernels/xpu/batch_norm_compute.h index 7b428476b96ca3b2b60c66df28b7f82e8f57bebc..f5244574cebab6b10bbd81af9c8303ffec9f0965 100644 --- a/lite/kernels/xpu/batch_norm_compute.h +++ b/lite/kernels/xpu/batch_norm_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h index 8992c29732630a5bf0d9c092461569234257e3a9..efd4cbae8d2d708b25729f04f36bc22d1d909e11 100644 --- a/lite/kernels/xpu/cast_compute.h +++ b/lite/kernels/xpu/cast_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f088bb80f0c500c6f900726195bcb5903049d3fb --- /dev/null +++ b/lite/kernels/xpu/concat_compute.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
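For a sense of scale, the two scratch pools sized by L3_SLOT_SIZE and GM_SLOT_SIZE in the search-attention header above (five slots each) work out as follows; the byte figures are plain arithmetic.

    #include <cstddef>

    // Footprint of the scratch slots declared in __xpu__search_attention_compute.h.
    constexpr std::size_t kL3SlotFloats = 40 * 128 * 128;  //    655,360 floats per L3 slot
    constexpr std::size_t kGmSlotFloats = 40 * 512 * 512;  // 10,485,760 floats per GM slot
    constexpr std::size_t kL3PoolBytes = 5 * kL3SlotFloats * sizeof(float);  // 12.5 MiB
    constexpr std::size_t kGmPoolBytes = 5 * kGmSlotFloats * sizeof(float);  // 200 MiB
    static_assert(kL3PoolBytes == 13107200, "five L3 slots take 12.5 MiB");
    static_assert(kGmPoolBytes == 209715200, "five GM slots take 200 MiB");

This lines up with the max_seq > 128 branch in Run: once a padded [batch, max_seq, max_seq] tile no longer fits the smaller slots, the kernel presumably falls back to the larger general-memory pool.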
+ +#include "lite/kernels/xpu/concat_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto ins = param.x; + auto out = param.output; + int64_t axis = param.axis; + + int n = ins.size(); + int h = 1; + int w_except_axis = 1; + CHECK(n <= 8) << "XPU only surpport at most 8 tensors for now"; + for (int i = 0; i < axis; ++i) { + h *= (ins[0]->dims())[i]; + } + for (int i = axis + 1; i < ins[0]->dims().size(); ++i) { + w_except_axis *= (ins[0]->dims())[i]; + } + CHECK(axis >= 0) << "concat: axis shoud >= 0!"; + CHECK(axis < ins[0]->dims().size()) << "concat: axis shoud < ins[0]->dims()!"; + for (int i = 0; i < n; ++i) { + int hh = 1; + int ww = 1; + for (int j = 0; j < axis; ++j) { + hh *= (ins[i]->dims())[j]; + } + for (int j = axis + 1; j < ins[i]->dims().size(); ++j) { + ww *= (ins[i]->dims())[j]; + } + CHECK(hh == h) << "concat: h should be eual!"; + CHECK(ww == w_except_axis) << "concat: w should be eual except for axis!"; + } + + int in_w_host[n]; // NOLINT + const float* ptrs[n]; // NOLINT + + for (int i = 0; i < n; ++i) { + ptrs[i] = ins[i]->data(); + in_w_host[i] = w_except_axis * (ins[i]->dims())[axis]; + } + + int r = xdnn::concat(ctx.GetRawContext(), /* ctx */ + h, /* height */ + in_w_host, /* width_x */ + n, /* n */ + ptrs, /* lm_ptrs */ + out->mutable_data(TARGET(kXPU)) /*y*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + concat, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ConcatCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/utils.h b/lite/kernels/xpu/concat_compute.h similarity index 78% rename from lite/kernels/xpu/utils.h rename to lite/kernels/xpu/concat_compute.h index d410cb1567d5c60aeb52b798d9f17c7f5692e096..f29899a741194270272770d8b781cd9b0b54abc9 100644 --- a/lite/kernels/xpu/utils.h +++ b/lite/kernels/xpu/concat_compute.h @@ -14,15 +14,20 @@ #pragma once -#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" namespace paddle { namespace lite { namespace kernels { namespace xpu { -struct XPUFreeDeleter { - void operator()(void* p) const { xpu_free(p); } +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + virtual void Run(); + + virtual ~ConcatCompute() = default; }; } // namespace xpu diff --git a/lite/kernels/xpu/conv_compute.h b/lite/kernels/xpu/conv_compute.h index b7631ce4e5773afe7cdd797a245c806b51d25c56..76159444c1861fad14b6ac4f0d32da626b3a8802 100644 --- a/lite/kernels/xpu/conv_compute.h +++ b/lite/kernels/xpu/conv_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/dropout_compute.h b/lite/kernels/xpu/dropout_compute.h index 0eaafb4f5555a163623402fee82d50bfa095b0b3..360450df537a68b9412d21db4e06dc74d6071ca6 100644 --- a/lite/kernels/xpu/dropout_compute.h +++ b/lite/kernels/xpu/dropout_compute.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h index 863ee3c643f9c431dacd057e251941914b1dd1c5..d910b9293e74428c426d9505245bc5958fc9df3a 100644 --- a/lite/kernels/xpu/elementwise_compute.h +++ b/lite/kernels/xpu/elementwise_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/layer_norm_compute.h b/lite/kernels/xpu/layer_norm_compute.h index 5d2df37795811ef8027e12b25139f2b7091cceed..9eeb5924c512fcfbf8825a9ff775378dfe4d6d4c 100644 --- a/lite/kernels/xpu/layer_norm_compute.h +++ b/lite/kernels/xpu/layer_norm_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/lookup_table_compute.cc b/lite/kernels/xpu/lookup_table_compute.cc index 568d303adefaa06bb8665b4cc92d4a949419d587..4256687fa8c17c7fe36e91ff727d52eb1047646f 100644 --- a/lite/kernels/xpu/lookup_table_compute.cc +++ b/lite/kernels/xpu/lookup_table_compute.cc @@ -29,12 +29,13 @@ void LookupTableCompute::Run() { int embed_dim = param.W->dims()[1]; int r = xdnn::embedding( - ctx.GetRawContext(), /* context */ - num, /* num */ - param.Ids->data(), /* indices */ - embed_dim, /* embed_dim */ - param.W->data(), /* table */ - param.Out->mutable_data(TARGET(kXPU)) /* top */); + ctx.GetRawContext(), /* context */ + num, /* num */ + param.Ids->data(), /* indices */ + embed_dim, /* embed_dim */ + param.W->data(), /* table */ + param.Out->mutable_data(TARGET(kXPU)), /* top */ + param.padding_idx /* padding_idx */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/lookup_table_compute.h b/lite/kernels/xpu/lookup_table_compute.h index 2ba1afc869cf9c3a49ab1ad29c66c6c89ba87d19..7a43f5244e5d514a1644aac0437951af35bb7767 100644 --- a/lite/kernels/xpu/lookup_table_compute.h +++ b/lite/kernels/xpu/lookup_table_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.cc b/lite/kernels/xpu/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3ee547ccce56cd16401e4aca465e64d99a26185 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
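The lookup_table change above threads param.padding_idx through to xdnn::embedding. Under the usual lookup-table convention, which this reference sketch assumes rather than quotes from xdnn's documentation, ids equal to padding_idx yield an all-zero row instead of a table read:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Host-side reference for an embedding lookup with padding_idx.
    // table: [vocab, embed_dim], ids: [num], out: [num, embed_dim]
    void EmbeddingRef(const float* table, const int64_t* ids, int num,
                      int embed_dim, int64_t padding_idx, float* out) {
      for (int i = 0; i < num; ++i) {
        float* dst = out + i * embed_dim;
        if (ids[i] == padding_idx) {
          std::memset(dst, 0, sizeof(float) * embed_dim);  // padded position -> zeros
        } else {
          std::memcpy(dst, table + ids[i] * embed_dim, sizeof(float) * embed_dim);
        }
      }
    }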
+ +#include "lite/kernels/xpu/match_matrix_tensor_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void MatchMatrixTensorCompute::PrepareForRun() { + wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + offset_l_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_r_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void MatchMatrixTensorCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.x; + auto* y = param.y; + auto* w = param.w; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int dim_in = x->dims()[1]; + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* w_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kXPU)); + int batch_size = x->lod()[0].size() - 1; + + float* wx_max = reinterpret_cast(wx_max_xpu_guard_->addr_); + int* offset_l_xpu = reinterpret_cast(offset_l_xpu_guard_->addr_); + int* offset_r_xpu = reinterpret_cast(offset_r_xpu_guard_->addr_); + + int r = xdnn::gemm_int16_tmp_api( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + false, /* trans_b */ + x->dims()[0], /* m */ + dim_t * dim_in, /* n */ + dim_in, /* k */ + 1.0f, /* alpha */ + bottom_l_data, /* data_a */ + dim_in, /* lda */ + w_data, /* data_b */ + dim_t * dim_in, /* ldb */ + 0.0f, /* beta */ + bottom_l_trans_data, /* data_c */ + dim_t * dim_in, /* ldc */ + nullptr, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + 0.0f, /* max_a */ + w_max, /* max_b */ + wx_max /* max_c */); + CHECK_EQ(r, 0); + + int max_width = 0; + for (int i = 0; i < offset_l.size(); ++i) { + offset_l_cpu[i] = offset_l[i]; + if (i != 0 && (offset_l_cpu[i] - offset_l_cpu[i - 1] > max_width)) { + max_width = offset_l_cpu[i] - offset_l_cpu[i - 1]; + } + } + for (int i = 0; i < offset_r.size(); ++i) { + offset_r_cpu[i] = offset_r[i]; + if (i != 0 && (offset_r_cpu[i] - offset_r_cpu[i - 1] > max_width)) { + max_width = offset_r_cpu[i] - offset_r_cpu[i - 1]; + } + } + XPU_CALL(xpu_memcpy(offset_l_xpu, + offset_l_cpu.get(), + offset_l.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_r_xpu, + offset_r_cpu.get(), + offset_r.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + r = xdnn::match_matrix_tensor(ctx.GetRawContext(), + batch_size, + bottom_l_trans_data, + bottom_r_data, + offset_l_xpu, + offset_r_xpu, + dim_t, + dim_in, + out_data, + wx_max, + 
act,
+                                max_width);
+  CHECK_EQ(r, 0);
+
+  int lod_lv1_size = batch_size * dim_t;
+  int lod_lv2_size = x->lod()[0].back() * dim_t;
+  std::vector<size_t> out_lod0(batch_size + 1, 0);
+  std::vector<size_t> out_lod1(lod_lv1_size + 1, 0);
+  std::vector<size_t> out_lod2(lod_lv2_size + 1, 0);
+  for (int i = 0; i < batch_size; i++) {
+    out_lod0[i + 1] = out_lod0[i] + dim_t;
+    int len_l = offset_l[i + 1] - offset_l[i];
+
+    for (int j = 0; j < dim_t; j++) {
+      out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l;
+      int len_r = offset_r[i + 1] - offset_r[i];
+
+      for (int k = 0; k < len_l; k++) {
+        out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] =
+            out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r;
+      }
+    }
+  }
+
+  paddle::lite::LoD out_lod;
+  out_lod.push_back(top_offset);
+  out_lod.push_back(offset_l);
+  out_lod.push_back(offset_r);
+  out->set_lod(out_lod);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(match_matrix_tensor,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::MatchMatrixTensorCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.h b/lite/kernels/xpu/match_matrix_tensor_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bd0b622db1fce178ea66604d89dc50d6477a105
--- /dev/null
+++ b/lite/kernels/xpu/match_matrix_tensor_compute.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class MatchMatrixTensorCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatchMatrixTensorParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  XPUScratchPadGuard wx_max_xpu_guard_;
+  XPUScratchPadGuard offset_l_xpu_guard_;
+  XPUScratchPadGuard offset_r_xpu_guard_;
+
+  // Host staging buffers for the LoD offsets copied to the XPU scratchpads.
+  std::unique_ptr<int[]> offset_l_cpu;
+  std::unique_ptr<int[]> offset_r_cpu;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h
index aca3cbc603eff490ae19fd2546352adca3c1a7cf..0fef2086e294fa5cd79e49adeb6b136f484a1efd 100644
--- a/lite/kernels/xpu/matmul_compute.h
+++ b/lite/kernels/xpu/matmul_compute.h
@@ -13,6 +13,7 @@ // limitations under the License.
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/mul_compute.h b/lite/kernels/xpu/mul_compute.h index bb2778c0e73189b11135395b42655e0250bbfd0a..3c91384b726a4d43c6a38e96d143657c12dadd8a 100644 --- a/lite/kernels/xpu/mul_compute.h +++ b/lite/kernels/xpu/mul_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 5648554c41c76396184b7dc536f8c8628cbf23e4..39e14f04a8c41bc057ac5733d881ba713c0883b2 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h index 6989b0f0f31e54a63dac2f7c2090dc676e31acfb..5a84fe26a0d409dcd979ca7c26128775a4f64df2 100644 --- a/lite/kernels/xpu/scale_compute.h +++ b/lite/kernels/xpu/scale_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/search_fc_compute.cc b/lite/kernels/xpu/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..52a9999b468564d81288ce494f575a8d1d46e4fc --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
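A note on the buffer pattern that repeats across these new XPU kernels (match_matrix_tensor above, search_fc below, and the sequence_* kernels later in this patch): LoD offsets are kept on the host as 64-bit values, while the xdnn primitives read 32-bit offsets from device memory, so each kernel allocates a scratchpad once in PrepareForRun() and narrows/copies the offsets in Run(). The sketch below is illustrative only and uses the TargetWrapperXPU / XPUScratchPadGuard / XPU_CALL interfaces already referenced in this diff; LodStagingSketch and StageLod are hypothetical names.

#include <cstdint>
#include <memory>
#include <vector>
#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard

class LodStagingSketch {
 public:
  void PrepareForRun() {
    // One-time device allocation, reused by every Run() call.
    lod_xpu_guard_ = paddle::lite::TargetWrapperXPU::MallocScratchPad(
        XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
    lod_cpu_.reset(new int[XPU_MAX_LOD_SIZE]);
  }

  int* StageLod(const std::vector<uint64_t>& lod) {
    // Narrow the 64-bit host offsets to the 32-bit ints expected by xdnn,
    // then copy them into the device scratchpad.
    for (size_t i = 0; i < lod.size(); ++i) {
      lod_cpu_[i] = static_cast<int>(lod[i]);
    }
    int* lod_xpu = reinterpret_cast<int*>(lod_xpu_guard_->addr_);
    XPU_CALL(xpu_memcpy(lod_xpu,
                        lod_cpu_.get(),
                        lod.size() * sizeof(int),
                        XPUMemcpyKind::XPU_HOST_TO_DEVICE));
    return lod_xpu;
  }

 private:
  paddle::lite::XPUScratchPadGuard lod_xpu_guard_;
  std::unique_ptr<int[]> lod_cpu_;
};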
+ +#include "lite/kernels/xpu/search_fc_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchFcCompute::PrepareForRun() { + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(float), false /* use_l3 */); +} + +void SearchFcCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* b = param.b; + auto* top = param.Out; + float w_max = param.__xpu__w_max; + int out_size = param.out_size; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int batch = bottom->dims()[0]; + int _out = w->dims()[0]; + int _in = w->dims()[1]; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + std::vector top_dims{bottom->dims()[0], out_size}; + top->Resize(top_dims); + + const auto* bottom_data = bottom->data(); + const auto* weights = w->data(); + const auto* bias_data = b->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f}; + XPU_CALL(xpu_memcpy(maxs_xpu, + &maxs_cpu[0], + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::findmax( + ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + true, /* trans_b */ + batch, /* m */ + _out, /* n */ + _in, /* k */ + 1.0f, /* alpha */ + bottom_data, /* data_a */ + _in, /* lda */ + weights, /* data_b */ + _in, /* ldb */ + 0.0f, /* beta */ + top_data, /* data_c */ + _out, /* ldc */ + bias_data, /* bias */ + act, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_fc_compute.h b/lite/kernels/xpu/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c7ee06abd957187c18c1306f40a77735f40558e7 --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SearchFcCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SearchFcParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  XPUScratchPadGuard maxs_xpu_guard_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/search_grnn_compute.cc b/lite/kernels/xpu/search_grnn_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4e2e4a9969149b0d2f7f2b75c195d1b3a5fda5c
--- /dev/null
+++ b/lite/kernels/xpu/search_grnn_compute.cc
@@ -0,0 +1,285 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/search_grnn_compute.h"
+#include <algorithm>
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void SearchGrnnCompute::PrepareForRun() {
+  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SEQ_LEN * sizeof(int), false /* use_l3 */);
+  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float),
+                                                       false /* use_l3 */);
+
+  idx_sorted_by_width_data_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  new_offset_cpu.reset(new int[XPU_MAX_LOD_SEQ_LEN]);
+}
+
+void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param,
+                                       const paddle::lite::Tensor* bottom) {
+  auto* idx_sorted_by_width = param.idx_sorted_by_width;
+  auto* layout_input = param.layout_input;
+
+  int dim0 = bottom->dims()[0];
+  int dim1 = 1;
+  if (bottom->dims().size() > 1) {
+    dim1 = bottom->dims()[1];
+  }
+  int batch = bottom->lod()[0].size() - 1;
+  auto& offset = bottom->lod()[0];
+
+  idx_sorted_by_width->Resize({batch});
+  std::vector<int> width;
+  width.resize(batch);
+
+  // sort sequences by width (descending) and find the largest width in the
+  // batch
+  for (int i = 0; i < batch; i++) {
+    width[i] = offset[i + 1] - offset[i];
+    idx_sorted_by_width_data_cpu[i] = i;
+  }
+  std::sort(idx_sorted_by_width_data_cpu.get(),
+            idx_sorted_by_width_data_cpu.get() + batch,
+            [&width](int a, int b) { return width[a] > width[b]; });
+  int max_width = width[idx_sorted_by_width_data_cpu[0]];
+
+  // start of reorganizing the input
+  std::vector<uint64_t> new_offset;
+  new_offset.resize(max_width + 1);
+  new_offset[0] = 0;
+  int j = batch - 1;
+  int last_width = 0;
+  int sub_row = 0;
+  int sub_col = 0;
+
+  for (int i = 1; i <= max_width;) {
+    for (int k = j; k >= 0; --k) {
+      if (width[idx_sorted_by_width_data_cpu[k]] > last_width) {
+        sub_row = width[idx_sorted_by_width_data_cpu[k]] - last_width;
+        sub_col = k + 1;
+        for (int
s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted_by_width_data_cpu[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (bottom->dims().size() == 1) { + } else { + LoD new_lod; + new_lod.push_back(new_offset); + layout_input->set_lod(new_lod); + layout_input->Resize({dim0, dim1}); + } + + XPU_CALL(xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), + idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width->numel() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void SearchGrnnCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* tmp_buffer = param.tmp_buffer; + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + int cap_h = param.num_hidden; + int cap_e = param.num_input; + int cap_l = bottom->dims()[0]; + auto wi_max = param.__xpu__wi_max; + auto wh_max = param.__xpu__wh_max; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int dim = 1; + if (bottom->dims().size() > 1) { + dim = bottom->dims()[1]; + } + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{cap_l, cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kXPU)); + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + // Prepare idx_sorted_by_width + prepare_layout(param, bottom); + int batch = bottom->lod()[0].size() - 1; + int max_width = layout_input->lod()[0].size() - 1; + const auto& new_offset = layout_input->lod()[0]; + auto* new_emb = layout_input->mutable_data(TARGET(kXPU)); + + // Prepare offset and new_offset + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* new_offset_xpu = reinterpret_cast(new_offset_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + CHECK_LE(offset.size(), 64); + CHECK_LE(new_offset.size(), 256); + + for (size_t i = 0; i < offset.size(); ++i) { + offset_cpu[i] = offset[i]; + } + for (size_t i = 0; i < new_offset.size(); ++i) { + new_offset_cpu[i] = new_offset[i]; + } + XPU_CALL(xpu_memcpy(offset_xpu, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(new_offset_xpu, + new_offset_cpu.get(), + new_offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::search_seq2batch(ctx.GetRawContext(), + batch, + max_width, + dim, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + bottom->data(), + new_emb); + CHECK_EQ(r, 0); + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + tmp_buffer->Resize({20, cap_l, cap_h}); + auto* buffer_data = tmp_buffer->mutable_data(TARGET(kXPU)); + // the internal hidden + auto* hidden = buffer_data + 19 * cap_l * cap_h; + + // do-findmax + float maxs_cpu[16] = {0.0f, + 0.0f, + 0.0f, + 0.0f, + wi_max[0], + 0.0f, + 0.0f, + 0.0f, + wi_max[1], + 0.0f, + 0.0f, + 0.0f, + wi_max[2], + 0.0f, + 0.0f, + 0.0f}; + XPU_CALL(xpu_memcpy(maxs_xpu, + maxs_cpu, + 16 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + r = xdnn::findmax( + ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu); + CHECK_EQ(r, 0); + + // precompute embedding to hidden + 
for (int i = 0; i < 3; ++i) { + const int16_t* data_b = dense_e2h + i * cap_e * cap_h; // e2h, e2hr, e2hz + float* data_c = buffer_data + i * cap_l * cap_h; // w_x_e, wr_x_e, wz_x_e + int r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + cap_l, + cap_h, + cap_e, // m, n, k + 1.0f, + new_emb, + cap_e, // alpha, data_a, lda + data_b, + cap_e, + 0.0f, // data_b, ldb, beta + data_c, + cap_h, // data_c, ldc + nullptr, + xdnn::Activation_t::LINEAR, // bias, act + maxs_xpu, + maxs_xpu + 4 * (i + 1)); // max_a, max_b + CHECK_EQ(r, 0); + } + + r = xdnn::search_grnn(ctx.GetRawContext(), + cap_l, + cap_h, + cap_e, + max_width, + new_offset_xpu, + buffer_data, + dense_h2h, + hidden, + wh_max[0], + wh_max[1], + wh_max[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx.GetRawContext(), + batch, + max_width, + cap_h, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + hidden, + top_hidden); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_grnn_compute.h b/lite/kernels/xpu/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7208e782474d39eabb41b4bc969d27a1d7d5f797 --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
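The least obvious part of the GRNN kernel is prepare_layout() above: sequences are sorted by length in descending order and new_offset[t] accumulates how many sequences are still active at time step t, i.e. the batch-major layout consumed by xdnn::search_seq2batch. The standalone host-side reference below (not part of the patch, names illustrative) reproduces the same offsets.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// new_offset has one entry per time step; each step adds the number of
// sequences whose length reaches that step.
std::vector<int> BuildNewOffset(std::vector<int> widths) {
  std::sort(widths.begin(), widths.end(), std::greater<int>());
  const int max_width = widths.front();
  std::vector<int> new_offset(max_width + 1, 0);
  for (int t = 1; t <= max_width; ++t) {
    int active = 0;
    for (int w : widths) active += (w >= t);
    new_offset[t] = new_offset[t - 1] + active;
  }
  return new_offset;
}

int main() {
  // Sequence lengths {3, 1, 2} -> new_offset {0, 3, 5, 6}: three rows are
  // processed at step 0, two at step 1 and one at step 2.
  for (int v : BuildNewOffset({3, 1, 2})) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}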
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void PrepareForRun() override; + + void prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom); + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard new_offset_xpu_guard_; + XPUScratchPadGuard maxs_xpu_guard_; + + std::unique_ptr idx_sorted_by_width_data_cpu; + std::unique_ptr offset_cpu; + std::unique_ptr new_offset_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.cc b/lite/kernels/xpu/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1b9866123395b2d7867154c3b398adae670ed97 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_arithmetic_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceArithmeticCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* top = param.Out; + + int op_type = param.op_type; + + auto len1 = bottom0->numel(); + auto len2 = bottom1->numel(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + int r = 0; + switch (op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, 
bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + default: + break; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.h b/lite/kernels/xpu/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9526587ac48cd5025022d646e31c24cac6b59a13 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_concat_compute.cc b/lite/kernels/xpu/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..349fdbad2a89300703c820588b4647bfba77ece5 --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
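For reference, the sequence_arithmetic / search_seq_arithmetic kernel registered above applies the elementwise op only over the first min(len_x, len_y) elements; when X is longer, the remaining tail of X is copied through unchanged, which is what the elementwise_* + memcpy_device pairs implement. A CPU sketch of the addition case (helper name illustrative, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <vector>

// CPU reference for op_type == 1 (addition) including the tail-copy behaviour.
std::vector<float> SeqAddReference(const std::vector<float>& x,
                                   const std::vector<float>& y) {
  std::vector<float> out(x.size());
  const size_t common = std::min(x.size(), y.size());
  for (size_t i = 0; i < common; ++i) out[i] = x[i] + y[i];
  // Elements of X beyond the overlap are passed through untouched.
  std::copy(x.begin() + common, x.end(), out.begin() + common);
  return out;
}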
+ +#include "lite/kernels/xpu/sequence_concat_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceConcatCompute::PrepareForRun() { + lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + lod0_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + lod1_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +void SequenceConcatCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto xs = param.X; + auto out = param.Out; + + size_t lod_size = 0; + for (auto& x : xs) { + if (lod_size == 0) { + lod_size = x->lod()[0].size(); + } else { + CHECK_EQ(lod_size, x->lod()[0].size()) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + + // TODO(miaotianxiang): + int64_t dim0 = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto& tensor : param.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.data(); + } + dim0 += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = dim0; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + CHECK(xs.size() == 2) << "XPU only support sequence_pool for 2 tensors"; + + auto lod0 = xs[0]->lod()[0]; + auto lod1 = xs[1]->lod()[0]; + int batch_size = lod0.size() - 1; + + int* lod0_xpu = reinterpret_cast(lod0_xpu_guard_->addr_); + int* lod1_xpu = reinterpret_cast(lod1_xpu_guard_->addr_); + for (int i = 0; i < lod0.size(); ++i) { + lod0_cpu[i] = lod0[i]; + } + for (int i = 0; i < lod1.size(); ++i) { + lod1_cpu[i] = lod1[i]; + } + XPU_CALL(xpu_memcpy(lod0_xpu, + lod0_cpu.get(), + lod0.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(lod1_xpu, + lod1_cpu.get(), + lod1.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_concat(ctx.GetRawContext(), + xs[0]->data(), + lod0_xpu, + xs[1]->data(), + lod1_xpu, + out->mutable_data(TARGET(kXPU)), + batch_size); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_concat_compute.h b/lite/kernels/xpu/sequence_concat_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..5726671975d546d1e549ecbe95790c11faafba7b
--- /dev/null
+++ b/lite/kernels/xpu/sequence_concat_compute.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SequenceConcatCompute
    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceConcatParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  XPUScratchPadGuard lod0_xpu_guard_;
+  XPUScratchPadGuard lod1_xpu_guard_;
+
+  std::unique_ptr<int[]> lod0_cpu;
+  std::unique_ptr<int[]> lod1_cpu;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8e71639b7f4c67f7e60103a42766a4d32026bc1
--- /dev/null
+++ b/lite/kernels/xpu/sequence_pool_compute.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
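The ConcatLoD helper in sequence_concat_compute.cc above merges the level-0 LoD of the inputs by summing their offsets per sequence index, e.g. {0, 2, 5} and {0, 1, 3} merge to {0, 3, 8}. A host-side illustration (not part of the patch):

#include <cstdint>
#include <vector>

// Entry i of the merged LoD is the sum of entry i of every input LoD.
std::vector<uint64_t> MergeLod0(const std::vector<std::vector<uint64_t>>& lods) {
  std::vector<uint64_t> result(lods[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    for (const auto& lod : lods) result[i] += lod[i];
  }
  return result;
}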
+ +#include "lite/kernels/xpu/sequence_pool_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUSequencePoolCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void XPUSequencePoolCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* out = param.Out; + std::string pool_type_str = param.pool_type; + + auto dims = in->dims(); + auto lod = in->lod(); + dims[0] = lod[0].size() - 1; + + xdnn::Pooling_t pool_type = xdnn::Pooling_t::MAX_WITHOUT_INDEX; + if (pool_type_str == "MAX") { + } else if (pool_type_str == "LAST") { + pool_type = xdnn::Pooling_t::LAST; + } else { + CHECK(false); + } + + int num_seq = out->dims()[0]; + int dim = out->numel() / num_seq; + + auto in_lod = in->lod()[0]; + for (size_t i = 0; i < in_lod.size(); ++i) { + lod_cpu[i] = in_lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = + xdnn::sequence_pooling_forward(ctx.GetRawContext(), + pool_type, + num_seq, + lod_xpu, + dim, + in->data(), + nullptr /* index */, + out->mutable_data(TARGET(kXPU))); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUSequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_pool_compute.h b/lite/kernels/xpu/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..232634de0e387e764eccdeeda4cb8fd2d5dce598 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
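sequence_pool above only wires up the MAX and LAST modes of xdnn::sequence_pooling_forward, producing one output row per LoD segment. A CPU reference for the MAX case, useful when checking device results on the host (name illustrative, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// One output row per LoD segment; each element is the maximum over that
// segment's rows. With lod {0, 2, 5} and dim 3, rows 0-1 reduce to output
// row 0 and rows 2-4 reduce to output row 1.
std::vector<float> SeqMaxPoolReference(const std::vector<float>& in,
                                       const std::vector<uint64_t>& lod,
                                       int dim) {
  const int num_seq = static_cast<int>(lod.size()) - 1;
  std::vector<float> out(num_seq * dim, std::numeric_limits<float>::lowest());
  for (int s = 0; s < num_seq; ++s) {
    for (uint64_t r = lod[s]; r < lod[s + 1]; ++r) {
      for (int d = 0; d < dim; ++d) {
        out[s * dim + d] = std::max(out[s * dim + d], in[r * dim + d]);
      }
    }
  }
  return out;
}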
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUSequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_reverse_compute.cc b/lite/kernels/xpu/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb3f37890b644a660c594fb0fd6eea332b90b8d6 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_reverse_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void SequenceReverseCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* y = param.Out; + + auto lod = x->lod()[0]; + size_t limit = x->numel(); + size_t ele_cnt_in_4_byte = limit / x->dims()[0]; + auto* x_data = x->template data(); + auto* y_data = y->template mutable_data(TARGET(kXPU)); + int batch_size = lod.size() - 1; + + if (std::is_same::value) { + ele_cnt_in_4_byte /= 4; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } + + for (size_t i = 0; i < lod.size(); ++i) { + lod_cpu[i] = lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_reverse(ctx.GetRawContext(), + batch_size, + lod_xpu, + ele_cnt_in_4_byte, + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using SequenceReverseFp32 = + xpu::SequenceReverseCompute; +using SequenceReverseInt64 = + xpu::SequenceReverseCompute; + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kFloat, kNCHW, SequenceReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kInt64, kNCHW, SequenceReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_reverse_compute.h b/lite/kernels/xpu/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91b285de767c65f93352380df7877e53d61ccd53 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e8485e2999b29dfb487d0c7c632fcfa7a9a3d00 --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/sequence_topk_avg_pooling_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceTopkAvgPoolingCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + in_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + row_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + col_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void SequenceTopkAvgPoolingCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* row = param.ROW; + auto* col = param.COLUMN; + auto* out = param.Out; + auto* pos = param.pos; + + auto channel_num = param.channel_num; + auto topks = param.topks; + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + std::vector vec_pos_shape; + vec_pos_shape.push_back(pos_total_size); + pos->Resize(vec_pos_shape); + auto pos_data = pos->mutable_data(TARGET(kXPU)); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(TARGET(kXPU)); + + int* in_lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + int* row_lod_xpu = in_lod_xpu + in_lod.size(); + int* col_lod_xpu = row_lod_xpu + row_lod.size(); + int* topks_xpu = col_lod_xpu + col_lod.size(); + for (int i = 0; i < in_lod.size(); ++i) { + in_lod_cpu[i] = in_lod[i]; + } + for (int i = 0; i < row_lod.size(); ++i) { + row_lod_cpu[i] = row_lod[i]; + } + for (int i = 0; i < col_lod.size(); ++i) { + col_lod_cpu[i] = col_lod[i]; + } + XPU_CALL(xpu_memcpy(in_lod_xpu, + in_lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(row_lod_xpu, + row_lod_cpu.get(), + row_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(col_lod_xpu, + col_lod_cpu.get(), + col_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topks_xpu, + topks.data(), + topks.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(), + in_data, + out_data, + pos_data, + batch_size, + channel_num, + in_lod_xpu, + row_lod_xpu, + col_lod_xpu, + topks_xpu, + k_num); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_topk_avg_pooling, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..7c54ca96225ee9ec37d6d0487a526347c19fdb2d --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr in_lod_cpu; + std::unique_ptr row_lod_cpu; + std::unique_ptr col_lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/softmax_compute.h b/lite/kernels/xpu/softmax_compute.h index e807f38a2ea3c9645b78340ac4dc87d1984c40f7..a3d282588776b7d64bc856adf92685c8524af035 100644 --- a/lite/kernels/xpu/softmax_compute.h +++ b/lite/kernels/xpu/softmax_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc index 90a6c70b49f39ce744f2a03eec41d79ddc768a19..156162923ceeb4abed466164b11672715f813fd7 100644 --- a/lite/kernels/xpu/stack_compute.cc +++ b/lite/kernels/xpu/stack_compute.cc @@ -25,9 +25,8 @@ void StackCompute::PrepareForRun() { auto& param = this->Param(); int n = param.X.size(); - void* x_ptr = nullptr; - xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */); - x_ptr_guard_.reset(x_ptr); + x_ptr_guard_ = TargetWrapperXPU::MallocScratchPad( + n * 8 /* sizeof(__global__ float*) */, false /* use_l3 */); x_ptr_cpu_.reserve(n); } @@ -47,14 +46,15 @@ void StackCompute::Run() { for (int i = 0; i < n; ++i) { x_ptr_cpu_[i] = param.X[i]->data(); } - xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy( + x_ptr_guard_->addr_, &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE)); int r = xdnn::stack_forward( ctx.GetRawContext(), /* context */ height, /* height */ width, /* width */ n, /* n */ - x_ptr_guard_.get(), /* x_ptr */ + x_ptr_guard_->addr_, /* x_ptr */ param.Out->mutable_data(TARGET(kXPU)) /* out */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h index 1ba1d92dc9479cfd00c5e154df7b5476ffd9976c..7618e2a147b862aee097a42b36721d520ad6012c 100644 --- a/lite/kernels/xpu/stack_compute.h +++ b/lite/kernels/xpu/stack_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -35,7 +34,7 @@ class StackCompute : public KernelLite { virtual ~StackCompute() = 
default; private: - std::unique_ptr x_ptr_guard_; + XPUScratchPadGuard x_ptr_guard_; std::vector x_ptr_cpu_; }; diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 9c2191331c85a7f99ffb5a2e9662ed5831cb1dda..ac301108386e2da43b2efc372b96531df8d55523 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -27,26 +27,50 @@ namespace lite { namespace kernels { namespace xpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; // Convert all of ops and their input vars and weights and added into the XPU // IR graph subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kXPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kXPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the XPU IR graph and build the graph to the XPU @@ -86,7 +110,7 @@ int SubgraphEngine::BuildDeviceProgram() { &graph.builder_, &graph.params_, &device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[XPU] Build model failed!"; - return subgraph::FAILED; + return false; } // Query and check the dimensions of input and output tensors @@ -100,7 +124,7 @@ int SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_inames_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); + origin_itensors_[i] = exec_scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] @@ -124,7 +148,7 @@ int SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_onames_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + origin_otensors_[i] = exec_scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] @@ -166,10 +190,10 @@ int SubgraphEngine::BuildDeviceProgram() { device_otensors_[i].strides = nullptr; device_otensors_[i].byte_offset = 0; } - return status; 
+ return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_itensors_.size(); i++) { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = @@ -191,24 +215,23 @@ int SubgraphEngine::LaunchDeviceProgram() { const_cast(origin_otensors_[i]->raw_data()); device_program_->CopyOutputTo(i, &device_otensors_[i]); } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace xpu diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 601c8821bc826e350c233573bf7eff89cdf5c1f5..25ffa721572ce05b0652d56659f3db12903c589b 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -31,21 +31,26 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::vector device_inames_; std::vector device_onames_; - std::vector device_itensors_; - std::vector device_otensors_; + std::vector device_itensors_{}; + std::vector device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/xpu/var_conv_2d_compute.cc b/lite/kernels/xpu/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b73581951f46a5f3cdbaf64cf732b1909805d27d --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/var_conv_2d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void VarConv2DCompute::PrepareForRun() { + offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_x_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_y_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void VarConv2DCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* top = param.Out; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int batch = bottom->lod()[0].size() - 1; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + int top_im_y = 0; + if (width != 0) { + top_im_x = (width - 1) / stride_w + 1; + } + if (height != 0) { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + auto* bottom_data = bottom->data(); + auto* w_data = w->data(); + + int* offset_x_xpu = reinterpret_cast(offset_x_xpu_guard_->addr_); + int* offset_y_xpu = reinterpret_cast(offset_y_xpu_guard_->addr_); + for (int i = 0; i < (batch + 1); ++i) { + offset_x_cpu[i] = offset_x[i]; + offset_y_cpu[i] = offset_y[i]; + } + XPU_CALL(xpu_memcpy(offset_x_xpu, + offset_x_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_y_xpu, + offset_y_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::search_varconv(ctx.GetRawContext(), + batch, + input_channel, + output_channel, + kernel_h, + kernel_w, + stride_h, + stride_w, + bottom_data, + w_data, + offset_x_xpu, + offset_y_xpu, + top_data, + w_max, + act); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git 
a/lite/kernels/xpu/var_conv_2d_compute.h b/lite/kernels/xpu/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..4d9f0ca7a9851a0c3071e72519c4ad1f40ea3483 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_x_xpu_guard_; + XPUScratchPadGuard offset_y_xpu_guard_; + std::unique_ptr offset_x_cpu; + std::unique_ptr offset_y_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/CMakeLists.txt b/lite/model_parser/CMakeLists.txt index 34d524c5c1b86fb6b689b86089c355e3de42a34e..a83cecf4444910e710d0eb92b9c3449190f5bda2 100644 --- a/lite/model_parser/CMakeLists.txt +++ b/lite/model_parser/CMakeLists.txt @@ -1,8 +1,9 @@ if (NOT LITE_ON_TINY_PUBLISH) add_subdirectory(pb) endif() -add_subdirectory(cpp) +add_subdirectory(general) add_subdirectory(naive_buffer) +add_subdirectory(flatbuffers) #lite_cc_library(runtime_lite SRCS runtime.cc) diff --git a/lite/model_parser/base/apis.h b/lite/model_parser/base/apis.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3449017c902479a7f6ad37ef73b3a316f585cc --- /dev/null +++ b/lite/model_parser/base/apis.h @@ -0,0 +1,23 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/base/proto_desc.h" +#include "lite/model_parser/base/traits.h" +#include "lite/model_parser/base/var_desc.h" +#include "lite/utils/all.h" diff --git a/lite/model_parser/base/block_desc.h b/lite/model_parser/base/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..b3d2e2452714d474e9d6bc9280cb2c5455fecc98 --- /dev/null +++ b/lite/model_parser/base/block_desc.h @@ -0,0 +1,86 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +class BlockDescReadAPI { + public: + virtual int32_t Idx() const = 0; + virtual int32_t ParentIdx() const = 0; + virtual size_t VarsSize() const = 0; + virtual size_t OpsSize() const = 0; + virtual int32_t ForwardBlockIdx() const = 0; + + template + T* GetVar(int32_t idx); + + template + T const* GetVar(int32_t idx) const; + + template + T* GetOp(int32_t idx); + + template + T const* GetOp(int32_t idx) const; + + virtual ~BlockDescReadAPI() = default; +}; + +class BlockDescWriteAPI { + public: + virtual void SetIdx(int32_t idx) { NotImplemented(); } + virtual void SetParentIdx(int32_t idx) { NotImplemented(); } + virtual void ClearVars() { NotImplemented(); } + virtual void ClearOps() { NotImplemented(); } + virtual void SetForwardBlockIdx(int32_t idx) { NotImplemented(); } + + template + T* AddVar() { + NotImplemented(); + return nullptr; + } + + template + T* AddOp() { + NotImplemented(); + return nullptr; + } + + virtual ~BlockDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "BlockDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class BlockDescAPI : public BlockDescReadAPI, public BlockDescWriteAPI { + public: + virtual ~BlockDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/op_desc.h b/lite/model_parser/base/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..534ff0feabd2234b4d7a72894383020a5f64d594 --- /dev/null +++ b/lite/model_parser/base/op_desc.h @@ -0,0 +1,99 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
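The comment above notes that model reading and writing are one-time and separate, so the write-side interface only exists to fail loudly when a read-only backend is in use. A toy illustration of that split, using invented stand-in classes rather than the real Lite descriptors:

// Read/write split in miniature: a read-only backend implements only the read
// side, and any write call falls through to the default "not available" handler.
#include <cstdint>
#include <cstdlib>
#include <iostream>

struct ReadAPI {
  virtual int32_t Idx() const = 0;
  virtual ~ReadAPI() = default;
};

struct WriteAPI {
  virtual void SetIdx(int32_t) { Abort(); }
  virtual ~WriteAPI() = default;

 private:
  void Abort() const {
    std::cerr << "write interface unavailable in read-only mode\n";
    std::abort();
  }
};

// Combined interface kept only so legacy callers keep compiling.
struct DescAPI : ReadAPI, WriteAPI {};

struct ReadOnlyDesc : DescAPI {
  explicit ReadOnlyDesc(int32_t idx) : idx_(idx) {}
  int32_t Idx() const override { return idx_; }
  // SetIdx() is intentionally not overridden: calling it aborts at runtime.

 private:
  int32_t idx_;
};

int main() {
  ReadOnlyDesc desc(7);
  std::cout << desc.Idx() << "\n";  // reading works; desc.SetIdx(1) would abort
  return 0;
}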
+ +#pragma once +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { + +class OpDescReadAPI { + public: + virtual std::string Type() const = 0; + virtual std::vector Input(const std::string& param) const = 0; + virtual std::vector InputArgumentNames() const = 0; + virtual std::vector Output(const std::string& param) const = 0; + virtual std::vector OutputArgumentNames() const = 0; + virtual bool HasAttr(const std::string& name) const = 0; + virtual OpAttrType GetAttrType(const std::string& name) const = 0; + virtual std::vector AttrNames() const = 0; + + template + T GetAttr(const std::string& name) const; + + std::string Repr() const { + STL::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } + + virtual ~OpDescReadAPI() = default; +}; + +class OpDescWriteAPI { + public: + virtual void SetType(const std::string& type) { NotImplemented(); } + virtual void SetInput(const std::string& param, + const std::vector& args) { + NotImplemented(); + } + virtual void SetOutput(const std::string& param, + const std::vector& args) { + NotImplemented(); + } + + template + void SetAttr(const std::string& name, const T& v) { + NotImplemented(); + } + + virtual ~OpDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "OpDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class OpDescAPI : public OpDescReadAPI, public OpDescWriteAPI { + public: + using AttrType = OpAttrType; + virtual ~OpDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/program_desc.h b/lite/model_parser/base/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..9ca128bd0aa8ba39752247074e8d57c0d23513f3 --- /dev/null +++ b/lite/model_parser/base/program_desc.h @@ -0,0 +1,67 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
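Repr() in the reading interface above assembles a one-line summary of an op from its type and its input/output argument lists. A toy reading-side descriptor, with hypothetical op and argument names, showing the kind of string it produces:

// Toy descriptor illustrating the Repr() output format. This is only a sketch;
// the real OpDescReadAPI has many more pure-virtual methods.
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

struct ToyOp {
  std::string type{"fc"};
  std::vector<std::pair<std::string, std::vector<std::string>>> inputs{
      {"Input", {"x"}}, {"W", {"w0"}}};
  std::vector<std::pair<std::string, std::vector<std::string>>> outputs{
      {"Out", {"y"}}};

  std::string Repr() const {
    std::ostringstream ss;
    ss << type << "(";
    for (const auto& in : inputs) {
      ss << in.first << ":";
      for (const auto& v : in.second) ss << v << " ";
    }
    ss << ") -> (";
    for (const auto& out : outputs) {
      ss << out.first << ":";
      for (const auto& v : out.second) ss << v << " ";
    }
    ss << ")";
    return ss.str();
  }
};

int main() {
  std::cout << ToyOp{}.Repr() << "\n";  // fc(Input:x W:w0 ) -> (Out:y )
  return 0;
}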
+ +#pragma once + +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +class ProgramDescReadAPI { + public: + virtual size_t BlocksSize() const = 0; + virtual bool HasVersion() const = 0; + virtual int64_t Version() const = 0; + + template + T* GetBlock(int32_t idx); + + template + T const* GetBlock(int32_t idx) const; + + virtual ~ProgramDescReadAPI() = default; +}; + +class ProgramDescWriteAPI { + public: + virtual void ClearBlocks() { NotImplemented(); } + virtual void SetVersion(int64_t version) { NotImplemented(); } + + template + T* AddBlock() { + NotImplemented(); + return nullptr; + } + + virtual ~ProgramDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) + << "ProgramDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class ProgramDescAPI : public ProgramDescReadAPI, public ProgramDescWriteAPI { + public: + virtual ~ProgramDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/proto_desc.h b/lite/model_parser/base/proto_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..0f62ef6e43883fd41c509795d1e4f695fdbb8910 --- /dev/null +++ b/lite/model_parser/base/proto_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { + +// The Index of first Block in Program. also called root block. +constexpr int kRootBlockIdx = 0; +// The Parent Index of root Block, this block does not exist. +constexpr int kNoneBlockIdx = -1; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/traits.h b/lite/model_parser/base/traits.h new file mode 100644 index 0000000000000000000000000000000000000000..bda293686c7996abb9b0fe36edcc84407ed3b541 --- /dev/null +++ b/lite/model_parser/base/traits.h @@ -0,0 +1,82 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace lite { + +// The AttrType is used to make the proto::AttrType portable. 
+enum class OpAttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, +}; + +struct Standard {}; +struct Flatbuffers {}; + +template +class VectorView; + +template +struct OpDataTypeTrait; + +#define ATTR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait { \ + typedef type__ ET; \ + typedef type__ RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; +#define ATTR_VECTOR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait, U> { \ + typedef type__ ET; \ + typedef VectorView RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; + +ATTR_TYPE_TRAIT_IMPL(BLOCK, int16_t); +ATTR_TYPE_TRAIT_IMPL(INT, int32_t); +ATTR_TYPE_TRAIT_IMPL(FLOAT, float); +ATTR_TYPE_TRAIT_IMPL(STRING, std::string); +ATTR_TYPE_TRAIT_IMPL(BOOLEAN, bool); +ATTR_TYPE_TRAIT_IMPL(LONG, int64_t); + +ATTR_VECTOR_TYPE_TRAIT_IMPL(INTS, int32_t); +ATTR_VECTOR_TYPE_TRAIT_IMPL(FLOATS, float); +ATTR_VECTOR_TYPE_TRAIT_IMPL(STRINGS, std::string); +ATTR_VECTOR_TYPE_TRAIT_IMPL(LONGS, int64_t); + +#undef ATTR_TYPE_TRAIT_IMPL +#undef ATTR_VECTOR_TYPE_TRAIT_IMPL + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/var_desc.h b/lite/model_parser/base/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..47596f8792a83677a036bcb3d937e67576204546 --- /dev/null +++ b/lite/model_parser/base/var_desc.h @@ -0,0 +1,90 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +enum class VarDataType { + // Pod Types + BOOL = 0, + INT16, + INT32, + INT64, + FP16, + FP32, + FP64, + // Tensor is used in C++. 
+ SIZE_T, + UINT8, + INT8, + + // Other types that may need additional descriptions + LOD_TENSOR, + SELECTED_ROWS, + FEED_MINIBATCH, + FETCH_LIST, + STEP_SCOPES, + LOD_RANK_TABLE, + LOD_TENSOR_ARRAY, + PLACE_LIST, + READER, + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW, + TUPLE +}; + +class VarDescReadAPI { + public: + virtual std::string Name() const = 0; + virtual VarDataType GetType() const = 0; + virtual bool Persistable() const = 0; + virtual std::vector GetShape() const = 0; + virtual ~VarDescReadAPI() = default; +}; + +class VarDescWriteAPI { + public: + virtual void SetName(std::string name) { NotImplemented(); } + virtual void SetType(VarDataType type) { NotImplemented(); } + virtual void SetPersistable(bool persistable) { NotImplemented(); } + virtual void SetShape(const std::vector& dims) { NotImplemented(); } + virtual ~VarDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "VarDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class VarDescAPI : public VarDescReadAPI, public VarDescWriteAPI { + public: + using VarDataType = lite::VarDataType; + using Type = lite::VarDataType; + virtual ~VarDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/vector_view.h b/lite/model_parser/base/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..e4149d9c5acae83472904a86c47659355972855e --- /dev/null +++ b/lite/model_parser/base/vector_view.h @@ -0,0 +1,99 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits { + typedef T element_type; +}; + +template +struct VectorTraits; + +template +struct VectorTraits { + typedef std::vector vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename vector_type::const_reference const_reference; + typedef const_reference subscript_return_type; +}; + +} // namespace vector_view + +// In the process of optimizing the performance of model loading, we found +// that it was necessary to reduce the copying and construction of STL +// containers. So use VectorView to simulate the operation of STL containers +// without copying, such as iteration and subscripting. +// +// Currently, VectorView is applicable to STL vector and Flatbuffers Vector. +// We used the template Traits to unify the behavior of the two, and provided +// an implicit conversion operator from VectorView to STL vector. 
Please use +// implicit conversion with caution because it will bring significant overhead. + +template +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + typename Traits::subscript_return_type operator[](size_t i) const { + return cvec_->operator[](i); + } + typename Traits::const_iterator begin() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->begin(); + } + typename Traits::const_iterator end() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->end(); + } + size_t size() const { + if (!cvec_) { + return 0; + } + return cvec_->size(); + } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(size()); + for (size_t i = 0; i < size(); ++i) { + tmp.push_back(cvec_->operator[](i)); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/compatibility.cc b/lite/model_parser/compatibility.cc index 67d7c9d69152d31d1381ea847ef859a08e4f82a7..dd43f7bd25277e34a2fd8b04aae6b705402a0436 100644 --- a/lite/model_parser/compatibility.cc +++ b/lite/model_parser/compatibility.cc @@ -20,10 +20,7 @@ #include "lite/model_parser/naive_buffer/program_desc.h" #include "lite/model_parser/naive_buffer/var_desc.h" #ifndef LITE_ON_TINY_PUBLISH -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #endif namespace paddle { diff --git a/lite/model_parser/compatibility.h b/lite/model_parser/compatibility.h index 9e421d709d1823852d6dac5cd0070b4330f56752..a47870cf9c4d8e1743f2eb749823e88f18b33900 100644 --- a/lite/model_parser/compatibility.h +++ b/lite/model_parser/compatibility.h @@ -17,7 +17,7 @@ #include #include #include "lite/api/paddle_place.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatibility_test.cc b/lite/model_parser/compatibility_test.cc index b3cb38f1c95649567b72d73b8938420537ec7b5b..957bcb25ea68b5555c9937de4e87dc8e9c4923b1 100644 --- a/lite/model_parser/compatibility_test.cc +++ b/lite/model_parser/compatibility_test.cc @@ -17,10 +17,7 @@ #include "lite/api/paddle_lite_factory_helper.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); diff --git a/lite/model_parser/compatible_pb.cc b/lite/model_parser/compatible_pb.cc index 3d66a5234994036397e445744499696909a8ab3e..8bfeb419e51b01ae008959ac5af3e9752834b1ab 100644 --- a/lite/model_parser/compatible_pb.cc +++ b/lite/model_parser/compatible_pb.cc @@ -234,7 +234,7 @@ void OpAttrsCppToAny(const cpp::OpDesc &cpp_desc, OpDescType *any_desc) { template <> \ void TransformBlockDescCppToAny(const cpp::T &cpp_desc, \ NT::T *any_desc) { \ - auto desc = cpp_desc; \ + const cpp::T &desc = cpp_desc; \ any_desc->SetIdx(desc.Idx()); \ any_desc->SetParentIdx(desc.ParentIdx()); \ 
any_desc->SetForwardBlockIdx(desc.ForwardBlockIdx()); \ @@ -277,7 +277,7 @@ void OpAttrsCppToAny(const cpp::OpDesc &cpp_desc, OpDescType *any_desc) { template <> \ void TransformProgramDescCppToAny(const cpp::T &cpp_desc, \ NT::T *any_desc) { \ - auto desc = cpp_desc; \ + auto &desc = cpp_desc; \ if (desc.HasVersion()) { \ any_desc->SetVersion(desc.Version()); \ } \ diff --git a/lite/model_parser/compatible_pb.h b/lite/model_parser/compatible_pb.h index 80fee49133130b09fbdd490ed86dce0af924aac1..c9889a5879160dd60ec64c4806df8af888db99c9 100644 --- a/lite/model_parser/compatible_pb.h +++ b/lite/model_parser/compatible_pb.h @@ -21,10 +21,7 @@ * lite::pb::XXDesc/lite::naive_buffer::XXDesc. */ -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatible_pb_test.cc b/lite/model_parser/compatible_pb_test.cc index 088b64bf2cd13ce0f443f962bd2cb5f709c4d4f2..d9a46e463209eb33e6f2cb53f4644056f88e7085 100644 --- a/lite/model_parser/compatible_pb_test.cc +++ b/lite/model_parser/compatible_pb_test.cc @@ -14,10 +14,7 @@ #include "lite/model_parser/compatible_pb.h" #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/naive_buffer/block_desc.h" #include "lite/model_parser/naive_buffer/op_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/cpp_desc.h b/lite/model_parser/cpp_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..477f90a28d7bf1e31dbc648b18af42381e0c93d6 --- /dev/null +++ b/lite/model_parser/cpp_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/general/block_desc.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/program_desc.h" +#include "lite/model_parser/general/var_desc.h" + +namespace paddle { +namespace lite { +namespace cpp = general; +} +} diff --git a/lite/model_parser/desc_apis.h b/lite/model_parser/desc_apis.h deleted file mode 100644 index 801d89e57b9a77ce04516cfdb67ce8917694188e..0000000000000000000000000000000000000000 --- a/lite/model_parser/desc_apis.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc - * classes should implement this. - */ - -class VarDescAPI { - public: - enum class Type { - // Pod Types - BOOL = 0, - INT16, - INT32, - INT64, - FP16, - FP32, - FP64, - // Tensor is used in C++. - SIZE_T, - UINT8, - INT8, - - // Other types that may need additional descriptions - LOD_TENSOR, - SELECTED_ROWS, - FEED_MINIBATCH, - FETCH_LIST, - STEP_SCOPES, - LOD_RANK_TABLE, - LOD_TENSOR_ARRAY, - PLACE_LIST, - READER, - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW, - TUPLE - }; - - using VarDataType = Type; - - virtual ~VarDescAPI() = default; - - // Get var's name - virtual std::string Name() const = 0; - // Set var's name - virtual void SetName(std::string name) = 0; - // Get var's type - virtual Type GetType() const = 0; - // Set var's type - virtual void SetType(Type type) = 0; - // Tell whether var is persistable or not - virtual bool Persistable() const = 0; - // Set var to be persistable or not - virtual void SetPersistable(bool persistable) = 0; - // Get var's shape - virtual std::vector GetShape() const = 0; - // Set var's shape - virtual void SetShape(const std::vector& dims) = 0; -}; - -/* - * NOTE Some interfaces are weried, we remain them unchanged to keep compatible - * with framework::OpDesc in Fluid framework. - */ -class OpDescAPI { - public: - // The AttrType is used to make the proto::AttrType portable. - enum class AttrType { - INT = 0, - FLOAT = 1, - STRING = 2, - INTS = 3, - FLOATS = 4, - STRINGS = 5, - BOOLEAN = 6, - BOOLEANS = 7, - BLOCK = 8, - LONG = 9, - BLOCKS = 10, - LONGS = 11, - UNK, - }; - - template - struct AttrTypeTrait; - - template - struct DataTypeTrait; - - virtual ~OpDescAPI() = default; - - /// Get operator's type. - virtual std::string Type() const = 0; - /// Set operator's type. - virtual void SetType(const std::string& type) = 0; - /// Get arguments given the parameter. - virtual std::vector Input(const std::string& param) const = 0; - /// Get parameters. - virtual std::vector InputArgumentNames() const = 0; - /// Get arguments given the parameter. - virtual std::vector Output(const std::string& param) const = 0; - /// Get parameters. - virtual std::vector OutputArgumentNames() const = 0; - /// Set a input given the parameter and arguments. - virtual void SetInput(const std::string& param, - const std::vector& args) = 0; - virtual void SetOutput(const std::string& param, - const std::vector& args) = 0; - /// Tell whether this desc has an attribute. - virtual bool HasAttr(const std::string& name) const = 0; - - /// Get the type of an attribute. - virtual AttrType GetAttrType(const std::string& name) const = 0; - - virtual std::vector AttrNames() const = 0; - - /// Set an attribute. - template - void SetAttr(const std::string& name, const T& v); - - /// Get an attribute. 
- template - T GetAttr(const std::string& name) const; - - std::string Repr() const { - STL::stringstream ss; - ss << Type(); - ss << "("; - for (auto& arg : InputArgumentNames()) { - ss << arg << ":"; - for (auto val : Input(arg)) { - ss << val << " "; - } - } - ss << ") -> ("; - for (auto& arg : OutputArgumentNames()) { - ss << arg << ":"; - for (auto val : Output(arg)) { - ss << val << " "; - } - } - ss << ")"; - return ss.str(); - } -}; - -#define TYPE_TRAIT_IMPL(T, type__) \ - template <> \ - struct OpDescAPI::AttrTypeTrait { \ - typedef type__ DT; \ - }; \ - template <> \ - struct OpDescAPI::DataTypeTrait { \ - static constexpr AttrType AT = OpDescAPI::AttrType::T; \ - static constexpr const char* ATN = #T; \ - }; - -TYPE_TRAIT_IMPL(INT, int32_t); -TYPE_TRAIT_IMPL(FLOAT, float); -TYPE_TRAIT_IMPL(STRING, std::string); -TYPE_TRAIT_IMPL(BOOLEAN, bool); -TYPE_TRAIT_IMPL(LONG, int64_t); -TYPE_TRAIT_IMPL(INTS, std::vector); -TYPE_TRAIT_IMPL(FLOATS, std::vector); -TYPE_TRAIT_IMPL(STRINGS, std::vector); -TYPE_TRAIT_IMPL(LONGS, std::vector); -#undef TYPE_TRAIT_IMPL - -class BlockDescAPI { - public: - virtual ~BlockDescAPI() = default; - - virtual int32_t Idx() const = 0; - - virtual void SetIdx(int32_t idx) = 0; - - virtual int32_t ParentIdx() const = 0; - - virtual void SetParentIdx(int32_t idx) = 0; - - virtual size_t VarsSize() const = 0; - - virtual void ClearVars() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetVar(int32_t idx); - - template - T* AddVar(); - - virtual size_t OpsSize() const = 0; - - virtual void ClearOps() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetOp(int32_t idx); - - template - T* AddOp(); - - virtual int32_t ForwardBlockIdx() const = 0; - - virtual void SetForwardBlockIdx(int32_t idx) = 0; -}; - -class ProgramDescAPI { - public: - virtual ~ProgramDescAPI() = default; - - virtual size_t BlocksSize() const = 0; - - virtual void ClearBlocks() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetBlock(int32_t idx); - - template - T* AddBlock(); - - virtual bool HasVersion() const = 0; - - virtual int64_t Version() const = 0; - - virtual void SetVersion(int64_t version) = 0; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/model_parser/flatbuffers/CMakeLists.txt b/lite/model_parser/flatbuffers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7ae9514efaa406d6b339c7917ad3dc2ad4a1f4f --- /dev/null +++ b/lite/model_parser/flatbuffers/CMakeLists.txt @@ -0,0 +1,13 @@ +function(lite_fbs_library TARGET) + set(multiValueArgs SRCS FBS_DEPS) + cmake_parse_arguments(args "" "" "${multiValueArgs}" ${ARGN}) + lite_cc_library(${TARGET} SRCS ${args_SRCS}) + add_dependencies(${TARGET} ${args_FBS_DEPS}) +endfunction() + +lite_fbs_library(fbs_op_desc SRCS op_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_var_desc SRCS var_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_block_desc SRCS block_desc.cc FBS_DEPS framework_fbs_header) +lite_cc_library(fbs_program_desc SRCS program_desc.cc DEPS fbs_op_desc fbs_var_desc fbs_block_desc) +lite_cc_library(fbs_io SRCS io.cc DEPS fbs_program_desc) +lite_cc_test(test_vector_view SRCS vector_view_test.cc DEPS fbs_program_desc) diff --git 
a/lite/model_parser/flatbuffers/block_desc.cc b/lite/model_parser/flatbuffers/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..64087bb0707a891cc94a2d1234bb582312c3c10a --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/block_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return desc_->vars()->Get(idx); +} + +template <> +proto::OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return desc_->ops()->Get(idx); +} + +template <> +VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return &vars_[idx]; +} + +template <> +OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return &ops_[idx]; +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/block_desc.h b/lite/model_parser/flatbuffers/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..dd99bdaa69020823ad6ca50438f21356eae41459 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.h @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
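fbs::BlockDesc exposes the same index through two GetVar/GetOp specializations: one returning the raw FlatBuffers pointer and one returning the wrapped view object. A toy version of that pattern with stand-in types (not the real proto::VarDesc / fbs::VarDesc):

// Dual template specializations over the same storage, for illustration.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct RawVar { std::string name; };

struct VarView {
  explicit VarView(const RawVar* raw) : raw_(raw) {}
  std::string Name() const { return raw_->name; }

 private:
  const RawVar* raw_;  // not owned
};

class ToyBlock {
 public:
  ToyBlock() : raw_{{"x"}, {"y"}} {
    for (const auto& r : raw_) views_.emplace_back(&r);
  }
  template <typename T>
  const T* GetVar(int idx) const;

 private:
  std::vector<RawVar> raw_;
  std::vector<VarView> views_;
};

template <>
const RawVar* ToyBlock::GetVar<RawVar>(int idx) const {
  assert(idx < static_cast<int>(raw_.size()));
  return &raw_[idx];
}

template <>
const VarView* ToyBlock::GetVar<VarView>(int idx) const {
  assert(idx < static_cast<int>(views_.size()));
  return &views_[idx];
}

int main() {
  ToyBlock block;
  std::cout << block.GetVar<RawVar>(0)->name << " "
            << block.GetVar<VarView>(1)->Name() << "\n";  // x y
  return 0;
}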
+ +#pragma once + +#include +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/op_desc.h" +#include "lite/model_parser/flatbuffers/var_desc.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class BlockDesc : public BlockDescAPI { + public: + explicit BlockDesc(proto::BlockDesc const* desc) : desc_(desc) { + CHECK(desc_); + vars_.reserve(VarsSize()); + ops_.reserve(OpsSize()); + for (size_t idx = 0; idx < VarsSize(); ++idx) { + vars_.push_back(VarDesc(desc_->vars()->Get(idx))); + } + for (size_t idx = 0; idx < OpsSize(); ++idx) { + ops_.push_back(OpDesc(desc_->ops()->Get(idx))); + } + } + + int32_t Idx() const override { return desc_->idx(); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + size_t VarsSize() const override { return desc_->vars()->size(); } + + template + T const* GetVar(int32_t idx) const; + + template + T* GetVar(int32_t idx) { + NotImplemented(); + return nullptr; + } + + size_t OpsSize() const override { + CHECK(desc_); + CHECK(desc_->ops()); + return desc_->ops()->size(); + } + + template + T const* GetOp(int32_t idx) const; + + template + T* GetOp(int32_t idx) { + NotImplemented(); + return nullptr; + } + + const std::vector& GetVars() const { return vars_; } + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + BlockDesc() { NotImplemented(); } + + private: + proto::BlockDesc const* desc_; // not_own + std::vector vars_; + std::vector ops_; + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of BlockDesc is temporarily " + "unavailable in read-only mode."; + } +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/framework.fbs b/lite/model_parser/flatbuffers/framework.fbs new file mode 100644 index 0000000000000000000000000000000000000000..90f6e626088003975f18303e47230a85c303181d --- /dev/null +++ b/lite/model_parser/flatbuffers/framework.fbs @@ -0,0 +1,172 @@ +// Generated from framework.proto + +namespace paddle.lite.fbs.proto; + +enum AttrType : int { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, +} + +namespace paddle.lite.fbs.proto.VarType_; + +enum Type : int { + BOOL = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + LOD_TENSOR = 7, + SELECTED_ROWS = 8, + FEED_MINIBATCH = 9, + FETCH_LIST = 10, + STEP_SCOPES = 11, + LOD_RANK_TABLE = 12, + LOD_TENSOR_ARRAY = 13, + PLACE_LIST = 14, + READER = 15, + RAW = 17, + TUPLE = 18, + SIZE_T = 19, + UINT8 = 20, + INT8 = 21, +} + +namespace paddle.lite.fbs.proto.CompatibleInfo_; + +enum Type : int { + COMPATIBLE = 0, + DEFINITELY_NOT = 1, + POSSIBLE = 2, + BUG_FIX = 3, + PRECISION_CHANGE = 4, +} + +namespace paddle.lite.fbs.proto; + +table Version { + version:long; +} + +table OpDesc { + type:string (required); + inputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + outputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + attrs:[paddle.lite.fbs.proto.OpDesc_.Attr]; + is_target:bool; +} + +namespace paddle.lite.fbs.proto.OpDesc_; + +table Attr { + name:string (required, key); + type:paddle.lite.fbs.proto.AttrType; + i:int; + f:float; + s:string; + ints:[int]; + floats:[float]; + strings:[string]; + b:bool; + bools:[bool]; + block_idx:int; + l:long; + blocks_idx:[int]; + longs:[long]; +} + +table Var 
{ + parameter:string (required, key); + arguments:[string]; +} + +namespace paddle.lite.fbs.proto; + +table VarType { + type:paddle.lite.fbs.proto.VarType_.Type; + selected_rows:paddle.lite.fbs.proto.VarType_.TensorDesc; + lod_tensor:paddle.lite.fbs.proto.VarType_.LoDTensorDesc; + tensor_array:paddle.lite.fbs.proto.VarType_.LoDTensorArrayDesc; + reader:paddle.lite.fbs.proto.VarType_.ReaderDesc; + tuple:paddle.lite.fbs.proto.VarType_.Tuple; +} + +namespace paddle.lite.fbs.proto.VarType_; + +table TensorDesc { + data_type:paddle.lite.fbs.proto.VarType_.Type; + dims:[long]; +} + +table LoDTensorDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table LoDTensorArrayDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table ReaderDesc { + lod_tensor:[paddle.lite.fbs.proto.VarType_.LoDTensorDesc]; +} + +table Tuple { + element_type:[paddle.lite.fbs.proto.VarType_.Type]; +} + +namespace paddle.lite.fbs.proto; + +table VarDesc { + name:string (required, key); + type:paddle.lite.fbs.proto.VarType (required); + persistable:bool; + need_check_feed:bool; +} + +table BlockDesc { + idx:int; + parent_idx:int; + vars:[paddle.lite.fbs.proto.VarDesc]; + ops:[paddle.lite.fbs.proto.OpDesc]; + forward_block_idx:int = -1; +} + +table CompatibleInfo { + version:string (required); + type:paddle.lite.fbs.proto.CompatibleInfo_.Type; +} + +table OpCompatibleMap { + pair:[paddle.lite.fbs.proto.OpCompatibleMap_.OpCompatiblePair]; + default_required_version:string; +} + +namespace paddle.lite.fbs.proto.OpCompatibleMap_; + +table OpCompatiblePair { + op_name:string (required, key); + compatible_info:paddle.lite.fbs.proto.CompatibleInfo (required); +} + +namespace paddle.lite.fbs.proto; + +table ProgramDesc { + blocks:[paddle.lite.fbs.proto.BlockDesc]; + version:paddle.lite.fbs.proto.Version; + op_compatible_map:paddle.lite.fbs.proto.OpCompatibleMap; +} + +root_type paddle.lite.fbs.proto.ProgramDesc; diff --git a/lite/model_parser/flatbuffers/io.cc b/lite/model_parser/flatbuffers/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef8e9afaefe94d72113299050f16077a09f6c6cf --- /dev/null +++ b/lite/model_parser/flatbuffers/io.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/io.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace fbs { + +void LoadModel(const std::string& path, ProgramDesc* prog) { + CHECK(prog); + FILE* file = fopen(path.c_str(), "rb"); + fseek(file, 0, SEEK_END); + int64_t length = ftell(file); + rewind(file); + std::vector buf(length); + CHECK(fread(buf.data(), 1, length, file)); + fclose(file); + prog->Init(std::move(buf)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/io.h b/lite/model_parser/flatbuffers/io.h new file mode 100644 index 0000000000000000000000000000000000000000..1c81b192bbbcfc026bc4a2e77225c9a4c68208f3 --- /dev/null +++ b/lite/model_parser/flatbuffers/io.h @@ -0,0 +1,28 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/model_parser/flatbuffers/program_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +void LoadModel(const std::string& path, ProgramDesc* prog); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.cc b/lite/model_parser/flatbuffers/op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e416b020d8fed0861d1d0b02ae74a9ccc47df59 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/op_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +std::string OpDesc::GetAttr(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +std::string OpDesc::GetAttr(size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +lite::VectorView +OpDesc::GetAttr>(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(it) << "Attr " << name << "does not exist."; + return VectorView(it->strings()); +} + +template <> +VectorView OpDesc::GetAttr>( + size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + CHECK(it) << "Attr " << idx << "does not exist."; + return VectorView(it->strings()); +} + +#define GET_ATTR_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return it->fb_f__(); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return it->fb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } + +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.h b/lite/model_parser/flatbuffers/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..450aa49fa13b676b33bef8490c65061dc504431d --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.h @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/vector_view.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class OpDesc : public OpDescAPI { + public: + explicit OpDesc(proto::OpDesc const* desc) : desc_(desc) { CHECK(desc_); } + + std::string Type() const override { return desc_->type()->str(); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string& param) const override { + const auto& var = desc_->inputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& in : *var->arguments()) { + args_vec.push_back(in->str()); + } + } + return args_vec; + } + + std::vector InputArgumentNames() const override { + const auto& vars = desc_->inputs(); + std::vector input_names_vec; + if (vars) { + input_names_vec.reserve(vars->size()); + for (const auto& in : *vars) { + input_names_vec.push_back(in->parameter()->str()); + } + } + return input_names_vec; + } + + std::vector Output(const std::string& param) const override { + const auto& var = desc_->outputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var && var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& out : *var->arguments()) { + args_vec.push_back(out->str()); + } + } + return args_vec; + } + + std::vector OutputArgumentNames() const override { + const auto& vars = desc_->outputs(); + std::vector output_names_vec; + if (vars) { + output_names_vec.reserve(vars->size()); + for (const auto& out : *vars) { + output_names_vec.push_back(out->parameter()->str()); + } + } + return output_names_vec; + } + + bool HasAttr(const std::string& name) const override { + return desc_->attrs()->LookupByKey(name.c_str()) != nullptr; + } + + size_t AttrsSize() const { return desc_->attrs()->size(); } + + std::string AttrName(size_t idx) const { + return desc_->attrs()->Get(idx)->name()->str(); + } + + OpDescAPI::AttrType GetAttrType(const std::string& name) const override { + const auto& attr = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(attr) << "Can not find attr: " << name; + return static_cast(attr->type()); + } + + OpDescAPI::AttrType GetAttrType(size_t idx) const { + const auto& attr = desc_->attrs()->Get(idx); + CHECK(attr); + return static_cast(attr->type()); + } + + std::vector AttrNames() const override { + const auto& attrs = desc_->attrs(); + std::vector attr_names_vec; + if (attrs) { + attr_names_vec.reserve(attrs->size()); + for (const auto& attr : *attrs) { + attr_names_vec.push_back(attr->name()->str()); + } + } + return attr_names_vec; + } + + template + typename lite::OpDataTypeTrait::RT GetAttr( + const std::string& name) const; + + template + typename lite::OpDataTypeTrait::RT GetAttr(size_t idx) const; + + private: + proto::OpDesc const* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. 
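The comment above explains the plan to make cpp::Desc and the FlatBuffers descriptors interchangeable through namespace aliasing rather than inheritance. A toy illustration of that mechanism with invented namespaces:

// Two structurally compatible descriptor types live in different namespaces;
// a single alias decides which one the rest of the code sees.
#include <iostream>
#include <string>

namespace general_impl {
struct OpDesc {
  std::string Type() const { return "general"; }
};
}  // namespace general_impl

namespace fbs_impl {
struct OpDesc {
  std::string Type() const { return "flatbuffers"; }
};
}  // namespace fbs_impl

// Flip this alias (e.g. behind a build option) to swap the backing
// implementation without touching call sites.
namespace active = general_impl;

int main() {
  active::OpDesc op;
  std::cout << op.Type() << "\n";  // general
  return 0;
}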
+
+ public:
+  OpDesc() { NotImplemented(); }
+  bool HasInput(const std::string& param) const {
+    return desc_->inputs()->LookupByKey(param.c_str()) != nullptr;
+  }
+
+  const std::map<std::string, std::vector<std::string>>& inputs() const {
+    NotImplemented();
+    return inputs_;
+  }
+  const std::map<std::string, std::vector<std::string>>& outputs() const {
+    NotImplemented();
+    return outputs_;
+  }
+  std::map<std::string, std::vector<std::string>>* mutable_inputs() {
+    NotImplemented();
+    return &inputs_;
+  }
+  std::map<std::string, std::vector<std::string>>* mutable_outputs() {
+    NotImplemented();
+    return &outputs_;
+  }
+
+  std::vector<std::string> input_vars() const {
+    NotImplemented();
+    return std::vector<std::string>();
+  }
+
+  std::vector<std::string> output_vars() const {
+    NotImplemented();
+    return std::vector<std::string>();
+  }
+
+  bool HasOutput(const std::string& param) const {
+    return !Output(param).empty();
+  }
+
+  const std::map<std::string, Any>& attrs() const {
+    NotImplemented();
+    return attrs_;
+  }
+  const std::map<std::string, AttrType>& attr_types() const {
+    NotImplemented();
+    return attr_types_;
+  }
+
+ private:
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of OpDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+  std::string type_;
+  std::map<std::string, std::vector<std::string>> inputs_;
+  std::map<std::string, std::vector<std::string>> outputs_;
+  std::map<std::string, Any> attrs_;
+  std::map<std::string, AttrType> attr_types_;
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/program_desc.cc b/lite/model_parser/flatbuffers/program_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f04954a9dc890a0b5866a7e6c3f3c7b18f2783e4
--- /dev/null
+++ b/lite/model_parser/flatbuffers/program_desc.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/model_parser/flatbuffers/program_desc.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+template <>
+proto::BlockDesc const* ProgramDesc::GetBlock<proto::BlockDesc>(
+    int32_t idx) const {
+  CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()";
+  return desc_->blocks()->Get(idx);
+}
+
+template <>
+BlockDesc const* ProgramDesc::GetBlock<BlockDesc>(int32_t idx) const {
+  CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()";
+  return &blocks_[idx];
+}
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/program_desc.h b/lite/model_parser/flatbuffers/program_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..55218eef5b4037d13b2f45db6de6b94cb39d994e
--- /dev/null
+++ b/lite/model_parser/flatbuffers/program_desc.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+#include "lite/model_parser/base/program_desc.h"
+#include "lite/model_parser/flatbuffers/block_desc.h"
+#include "lite/model_parser/flatbuffers/framework_generated.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+class ProgramDesc : public ProgramDescAPI {
+ public:
+  ProgramDesc() = default;
+  explicit ProgramDesc(const std::vector<char>& buf) { Init(buf); }
+  explicit ProgramDesc(std::vector<char>&& buf) {
+    Init(std::forward<std::vector<char>>(buf));
+  }
+
+  void Init(const std::vector<char>& buf) {
+    CHECK(buf.data());
+    buf_ = buf;
+    InitProgramDesc();
+  }
+
+  void Init(std::vector<char>&& buf) {
+    CHECK(buf.data());
+    buf_ = std::move(buf);
+    InitProgramDesc();
+  }
+
+  void InitProgramDesc() {
+    desc_ = proto::GetProgramDesc(buf_.data());
+    blocks_.reserve(BlocksSize());
+    for (size_t idx = 0; idx < BlocksSize(); ++idx) {
+      blocks_.push_back(BlockDesc(desc_->blocks()->Get(idx)));
+    }
+  }
+
+  void CopyFrom(const ProgramDesc& other) {
+    buf_ = other.buf();
+    Init(buf_);
+  }
+
+  size_t BlocksSize() const override { return desc_->blocks()->size(); }
+
+  template <typename T>
+  T const* GetBlock(int32_t idx) const;
+
+  template <typename T>
+  T* GetBlock(int32_t idx) {
+    NotImplemented();
+    return nullptr;
+  }
+
+  const std::vector<BlockDesc>& GetBlocks() const { return blocks_; }
+
+  bool HasVersion() const override { return desc_->version() != nullptr; }
+
+  int64_t Version() const override {
+    CHECK(HasVersion());
+    return desc_->version()->version();
+  }
+
+  proto::ProgramDesc const* raw_desc() const { return desc_; }
+
+  const std::vector<char>& buf() const { return buf_; }
+
+ private:
+  proto::ProgramDesc const* desc_;
+  std::vector<char> buf_;
+  std::vector<BlockDesc> blocks_;
+
+ private:
+  ProgramDesc& operator=(const ProgramDesc&) = delete;
+  ProgramDesc(const ProgramDesc&) = delete;
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of ProgramDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/var_desc.cc b/lite/model_parser/flatbuffers/var_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a629ffd5e35223aee218a8798a597b8c684c8c62
--- /dev/null
+++ b/lite/model_parser/flatbuffers/var_desc.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/model_parser/flatbuffers/var_desc.h"
diff --git a/lite/model_parser/flatbuffers/var_desc.h b/lite/model_parser/flatbuffers/var_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..48d81df30f78ca668bbe9358b4f488fd2f4d3d66
--- /dev/null
+++ b/lite/model_parser/flatbuffers/var_desc.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/model_parser/base/var_desc.h"
+#include "lite/model_parser/flatbuffers/framework_generated.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+class VarDesc : public VarDescAPI {
+ public:
+  explicit VarDesc(proto::VarDesc const* desc) : desc_(desc) {}
+
+  std::string Name() const override { return desc_->name()->str(); }
+
+  VarDescAPI::Type GetType() const override {
+    return static_cast<VarDescAPI::Type>(desc_->type()->type());
+  }
+
+  bool Persistable() const override { return desc_->persistable(); }
+
+  std::vector<int64_t> GetShape() const override {
+    CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR);
+    const auto& dims = desc_->type()->lod_tensor()->tensor()->dims();
+    std::vector<int64_t> dims_vec;
+    dims_vec.reserve(dims->size());
+    for (const auto& dim : *dims) {
+      dims_vec.push_back(dim);
+    }
+    return dims_vec;
+  }
+
+  VarDescAPI::Type GetDataType() const {
+    CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR);
+    return static_cast<VarDescAPI::Type>(
+        desc_->type()->lod_tensor()->tensor()->data_type());
+  }
+
+ private:
+  proto::VarDesc const* desc_;
+
+  // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc
+  // and flatbuffers::Desc replace each other. However, there is no direct
+  // inheritance relationship between the two data types, and the read-only
+  // version of flatbuffers lacks some write implementations. Therefore, at
+  // present, we are temporarily providing a default interface that triggers
+  // execution-time errors to avoid type ambiguity and compile-time errors
+  // caused by different building options.
+
+ public:
+  VarDesc() { NotImplemented(); }
+  void SetDataType(Type data_type) { NotImplemented(); }
+  void SetShape(const std::vector<int64_t>& dims) { NotImplemented(); }
+
+ private:
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of VarDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+  std::vector<int64_t> shape_;
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/vector_view.h b/lite/model_parser/flatbuffers/vector_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb1331823a2dce79d2b3a6784f1f2d5b5864281d
--- /dev/null
+++ b/lite/model_parser/flatbuffers/vector_view.h
@@ -0,0 +1,143 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "flatbuffers/flatbuffers.h"
+#include "lite/model_parser/base/vector_view.h"
+
+namespace paddle {
+namespace lite {
+namespace vector_view {
+
+template <typename T>
+struct ElementTraits<T*,
+                     typename std::enable_if<std::is_class<T>::value>::type> {
+  typedef flatbuffers::Offset<T> element_type;
+};
+
+template <>
+struct ElementTraits<std::string, void> {
+  typedef flatbuffers::Offset<flatbuffers::String> element_type;
+};
+
+template <typename T>
+struct VectorTraits<T, Flatbuffers> {
+  typedef flatbuffers::Vector<typename ElementTraits<T>::element_type>
+      vector_type;
+  typedef typename vector_type::const_iterator const_iterator;
+  typedef typename const_iterator::value_type value_type;
+  typedef const typename const_iterator::reference const_reference;
+  typedef value_type subscript_return_type;
+};
+
+struct FBSStrIterator {
+  typedef flatbuffers::VectorIterator<
+      flatbuffers::Offset<flatbuffers::String>,
+      typename flatbuffers::IndirectHelper<
+          flatbuffers::Offset<flatbuffers::String>>::return_type>
+      VI;
+
+  FBSStrIterator() = default;
+  explicit FBSStrIterator(const VI& iter) { iter_ = iter; }
+  const VI& raw_iter() const { return iter_; }
+
+  bool operator==(const FBSStrIterator& other) const {
+    return iter_ == other.raw_iter();
+  }
+
+  bool operator<(const FBSStrIterator& other) const {
+    return iter_ < other.raw_iter();
+  }
+
+  bool operator!=(const FBSStrIterator& other) const {
+    return iter_ != other.raw_iter();
+  }
+
+  ptrdiff_t operator-(const FBSStrIterator& other) const {
+    return iter_ - other.raw_iter();
+  }
+
+  std::string operator*() const { return iter_.operator*()->str(); }
+  std::string operator->() const { return iter_.operator->()->str(); }
+
+  FBSStrIterator& operator++() {
+    iter_++;
+    return *this;
+  }
+
+  FBSStrIterator& operator--() {
+    iter_--;
+    return *this;
+  }
+
+  FBSStrIterator operator+(const size_t& offset) {
+    return FBSStrIterator(iter_ + offset);
+  }
+
+  FBSStrIterator operator-(const size_t& offset) {
+    return FBSStrIterator(iter_ - offset);
+  }
+
+ private:
+  VI iter_;
+};
+
+}  // namespace vector_view
+
+template <>
+class VectorView<std::string, Flatbuffers> {
+ public:
+  typedef vector_view::VectorTraits<std::string, Flatbuffers> Traits;
+  explicit VectorView(typename Traits::vector_type const* cvec) {
+    cvec_ = cvec;
+  }
+  std::string operator[](size_t i) const {
+    return cvec_->operator[](i)->str();
+  }
+  vector_view::FBSStrIterator begin() const {
+    if (!cvec_) {
+      return vector_view::FBSStrIterator();
+    }
+    return vector_view::FBSStrIterator(cvec_->begin());
+  }
+  vector_view::FBSStrIterator end() const {
+    if (!cvec_) {
+      return vector_view::FBSStrIterator();
+    }
+    return vector_view::FBSStrIterator(cvec_->end());
+  }
+  size_t size() const {
+    if (!cvec_) {
+      return 0;
+    }
+    return cvec_->size();
+  }
+  operator std::vector<std::string>() const {
+    VLOG(5) << "Copying elements out of VectorView will damage performance.";
+    std::vector<std::string> tmp;
+    tmp.reserve(size());
+    for (size_t i = 0; i < size(); ++i) {
+      tmp.push_back(cvec_->operator[](i)->str());
+    }
+    return tmp;
+  }
+  ~VectorView() = default;
+
+ private:
+  typename Traits::vector_type const* cvec_;
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/vector_view_test.cc b/lite/model_parser/flatbuffers/vector_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6512ee69bd4f34c0d6e49274d478404191fd9476
--- /dev/null
+++ b/lite/model_parser/flatbuffers/vector_view_test.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/vector_view.h" +#include +#include +#include +#include +#include "lite/model_parser/flatbuffers/framework_generated.h" + +namespace paddle { +namespace lite { + +TEST(VectorView, std_vector) { + std::vector vector{1, 2, 3}; + VectorView vector_view(&vector); + size_t i = 0; + for (const auto& value : vector_view) { + EXPECT_EQ(value, vector[i]); + ++i; + } + for (size_t j = 0; j < vector_view.size(); ++j) { + EXPECT_EQ(vector_view[i], vector[i]); + } +} + +TEST(VectorView, Flatbuffers) { + using namespace flatbuffers; // NOLINT + using namespace paddle::lite::fbs; // NOLINT + + auto create_desc = [](FlatBufferBuilder& fbb) { + /* --------- Set --------- */ + // Attr + std::vector ints({-1, 0, 1, 2, 3}); + auto string_0 = fbb.CreateString("string_0"); + auto string_1 = fbb.CreateString("string_1"); + std::vector> strings; + strings.push_back(string_0); + strings.push_back(string_1); + auto attr = proto::OpDesc_::CreateAttrDirect(fbb, + nullptr, + proto::AttrType_INT, + 0, + 0.0f, + nullptr, + &ints, + nullptr, + &strings); + + // OpDesc + std::vector> attrs; + attrs.push_back(attr); + auto op_desc = + proto::CreateOpDescDirect(fbb, "hello!", nullptr, nullptr, &attrs); + + // BlockDesc 0 + std::vector> ops; + ops.push_back(op_desc); + auto block_0 = proto::CreateBlockDescDirect(fbb, 0, 0, nullptr, &ops); + + // BlockDesc 1 + auto block_1 = proto::CreateBlockDescDirect(fbb, 1); + + // ProgramDesc + std::vector> block_vector; + block_vector.push_back(block_0); + block_vector.push_back(block_1); + auto orc = proto::CreateProgramDescDirect(fbb, &block_vector); + fbb.Finish(orc); + }; + + FlatBufferBuilder fbb; + create_desc(fbb); + auto program = fbs::proto::GetProgramDesc(fbb.GetBufferPointer()); + + // BlockDesc View + VectorView block_view(program->blocks()); + EXPECT_EQ(block_view.size(), static_cast(2)); + EXPECT_EQ(block_view[0]->idx(), 0); + EXPECT_EQ(block_view[1]->idx(), 1); + + // OpDesc & Attr View + VectorView op_view(block_view[0]->ops()); + EXPECT_EQ(op_view[0]->type()->str(), std::string("hello!")); + VectorView attr_view(op_view[0]->attrs()); + + // int32_t View + VectorView ints_view(attr_view[0]->ints()); + std::vector ints({-1, 0, 1, 2, 3}); + size_t cnt_0 = 0; + for (const auto& i : ints_view) { + EXPECT_EQ(i, ints[cnt_0]); + ++cnt_0; + } + for (size_t i = 0; i < ints_view.size(); ++i) { + EXPECT_EQ(ints_view[i], ints[i]); + } + std::vector ints_2(ints_view); + for (size_t i = 0; i < ints_2.size(); ++i) { + EXPECT_EQ(ints_2[i], ints[i]); + } + + // String View + VectorView strings_view(attr_view[0]->strings()); + std::vector strings({"string_0", "string_1"}); + EXPECT_EQ(strings_view.size(), strings.size()); + size_t cnt_1 = 0; + for (const auto& s : strings_view) { + EXPECT_EQ(s, strings[cnt_1]); + ++cnt_1; + } + for (size_t i = 0; i < strings_view.size(); ++i) { + EXPECT_EQ(strings_view[i], strings[i]); + } + std::vector string_2(strings_view); + for (size_t i = 0; i < string_2.size(); ++i) { + EXPECT_EQ(string_2[i], strings[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git 
a/lite/model_parser/cpp/CMakeLists.txt b/lite/model_parser/general/CMakeLists.txt similarity index 70% rename from lite/model_parser/cpp/CMakeLists.txt rename to lite/model_parser/general/CMakeLists.txt index fe3b2f848e404385b8d948db676865b8039f4ba2..ed53678dfac4cc58b208c2faa8573bcd06943aaa 100644 --- a/lite/model_parser/cpp/CMakeLists.txt +++ b/lite/model_parser/general/CMakeLists.txt @@ -3,4 +3,4 @@ lite_cc_library(cpp_var_desc SRCS var_desc.cc) lite_cc_library(cpp_block_desc SRCS block_desc.cc) lite_cc_library(cpp_program_desc SRCS program_desc.cc) -set(cpp_wrapper cpp_op_desc cpp_var_desc cpp_block_desc cpp_program_desc PARENT_SCOPE) +set(cpp_wrapper cpp_program_desc cpp_block_desc cpp_var_desc cpp_op_desc PARENT_SCOPE) diff --git a/lite/model_parser/cpp/block_desc.cc b/lite/model_parser/general/block_desc.cc similarity index 75% rename from lite/model_parser/cpp/block_desc.cc rename to lite/model_parser/general/block_desc.cc index a4dc7cd72acacb6392cecdfe9a551773c1937888..11d2376bc05a6086036b0fd026666b0b16b2de84 100644 --- a/lite/model_parser/cpp/block_desc.cc +++ b/lite/model_parser/general/block_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/block_desc.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> VarDesc* BlockDesc::GetVar(int32_t idx) { @@ -24,6 +24,12 @@ VarDesc* BlockDesc::GetVar(int32_t idx) { return &vars_[idx]; } +template <> +VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return &vars_[idx]; +} + template <> VarDesc* BlockDesc::AddVar() { vars_.emplace_back(); @@ -36,12 +42,18 @@ OpDesc* BlockDesc::GetOp(int32_t idx) { return &ops_[idx]; } +template <> +OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return &ops_[idx]; +} + template <> OpDesc* BlockDesc::AddOp() { ops_.emplace_back(); return &ops_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/block_desc.h b/lite/model_parser/general/block_desc.h similarity index 80% rename from lite/model_parser/cpp/block_desc.h rename to lite/model_parser/general/block_desc.h index b6f473b88b84bff71650dd4ecf4d1dc803351212..e618e570c20bfb0915289d2da625865fc5b64676 100644 --- a/lite/model_parser/cpp/block_desc.h +++ b/lite/model_parser/general/block_desc.h @@ -14,16 +14,17 @@ #pragma once #include -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/var_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::BlockDesc is the internal representation for Op. All the internal + * The general::BlockDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::BlockDesc. 
*/ class BlockDesc : public BlockDescAPI { @@ -45,6 +46,11 @@ class BlockDesc : public BlockDescAPI { template T* GetVar(int32_t idx); + template + T const* GetVar(int32_t idx) const; + + std::vector& GetVars() { return vars_; } + template T* AddVar(); @@ -55,6 +61,9 @@ class BlockDesc : public BlockDescAPI { template T* GetOp(int32_t idx); + template + T const* GetOp(int32_t idx) const; + template T* AddOp(); @@ -70,6 +79,6 @@ class BlockDesc : public BlockDescAPI { int32_t forward_block_idx_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.cc b/lite/model_parser/general/op_desc.cc similarity index 95% rename from lite/model_parser/cpp/op_desc.cc rename to lite/model_parser/general/op_desc.cc index a816943bb9689483f1eb60575147a42594db2654..b4589a14f26b641a0e48c69ec067cd847649b67e 100644 --- a/lite/model_parser/cpp/op_desc.cc +++ b/lite/model_parser/general/op_desc.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/general/op_desc.h" #include #include namespace paddle { namespace lite { -namespace cpp { +namespace general { std::vector OpDesc::OutputArgumentNames() const { std::vector res; @@ -69,6 +69,6 @@ bool OpDesc::HasOutput(const std::string& param) const { return it != outputs_.end(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.h b/lite/model_parser/general/op_desc.h similarity index 91% rename from lite/model_parser/cpp/op_desc.h rename to lite/model_parser/general/op_desc.h index 57d2f6bbb27a73e1093b6cef114d032e164c0432..e0c2541182adde6ab9171a55d859a5bd5a1195e2 100644 --- a/lite/model_parser/cpp/op_desc.h +++ b/lite/model_parser/general/op_desc.h @@ -17,16 +17,16 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/any.h" #include "lite/utils/varient.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::OpDesc is the internal representation for Op. All the internal + * The general::OpDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::OpDesc. 
*/ class OpDesc : public OpDescAPI { @@ -108,7 +108,7 @@ class OpDesc : public OpDescAPI { template void SetAttr(const std::string& name, const T& v) { - attr_types_[name] = OpDescAPI::DataTypeTrait::AT; + attr_types_[name] = OpDataTypeTrait::AT; attrs_[name].set(v); } @@ -119,8 +119,8 @@ class OpDesc : public OpDescAPI { auto attr_it = attr_types().find(name); CHECK(attr_it != attr_types().end()); auto pair = std::make_pair(it, attr_it); - CHECK(pair.second->second == OpDescAPI::DataTypeTrait::AT) - << "required type is " << OpDescAPI::DataTypeTrait::ATN + CHECK(pair.second->second == OpDataTypeTrait::AT) + << "required type is " << OpDataTypeTrait::ATN << " not match the true type"; return pair.first->second.get(); } @@ -131,6 +131,6 @@ class OpDesc : public OpDescAPI { } }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.cc b/lite/model_parser/general/program_desc.cc similarity index 78% rename from lite/model_parser/cpp/program_desc.cc rename to lite/model_parser/general/program_desc.cc index 3c6adcddf319db57366e5b3cdb05bc6169f229ee..b767a6f77ca657e8ec02b8e182dd8a8b62b7d6ab 100644 --- a/lite/model_parser/cpp/program_desc.cc +++ b/lite/model_parser/general/program_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/general/program_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> BlockDesc* ProgramDesc::GetBlock(int32_t idx) { @@ -24,12 +24,18 @@ BlockDesc* ProgramDesc::GetBlock(int32_t idx) { return &blocks_[idx]; } +template <> +BlockDesc const* ProgramDesc::GetBlock(int32_t idx) const { + CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()"; + return &blocks_[idx]; +} + template <> BlockDesc* ProgramDesc::AddBlock() { blocks_.emplace_back(); return &blocks_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.h b/lite/model_parser/general/program_desc.h similarity index 72% rename from lite/model_parser/cpp/program_desc.h rename to lite/model_parser/general/program_desc.h index 786dad134adf8d5ac4b03ba43b254359dfc2cdb2..bbc045412d2086473375863575e5d16146d84751 100644 --- a/lite/model_parser/cpp/program_desc.h +++ b/lite/model_parser/general/program_desc.h @@ -14,21 +14,29 @@ #pragma once #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::ProgramDesc is the internal representation for Op. All the internal + * The general::ProgramDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::ProgramDesc. 
*/ class ProgramDesc : public ProgramDescAPI { public: ProgramDesc() = default; + void CopyFrom(const ProgramDesc& other) { + version_ = other.Version(); + blocks_ = other.blocks(); + } + + const std::vector& blocks() const { return blocks_; } + size_t BlocksSize() const override { return blocks_.size(); } void ClearBlocks() override { blocks_.clear(); } @@ -36,6 +44,11 @@ class ProgramDesc : public ProgramDescAPI { template T* GetBlock(int32_t idx); + template + T const* GetBlock(int32_t idx) const; + + std::vector& GetBlocks() { return blocks_; } + template T* AddBlock(); @@ -52,6 +65,6 @@ class ProgramDesc : public ProgramDescAPI { std::vector blocks_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/var_desc.cc b/lite/model_parser/general/var_desc.cc similarity index 92% rename from lite/model_parser/cpp/var_desc.cc rename to lite/model_parser/general/var_desc.cc index e30bb3eb55d274d5287702d6247b94d5d33c4e74..f2782d1778b07ef201401a62f9c7a6295159ef5f 100644 --- a/lite/model_parser/cpp/var_desc.cc +++ b/lite/model_parser/general/var_desc.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/general/var_desc.h" diff --git a/lite/model_parser/cpp/var_desc.h b/lite/model_parser/general/var_desc.h similarity index 91% rename from lite/model_parser/cpp/var_desc.h rename to lite/model_parser/general/var_desc.h index c56d7cce53180e0157913372f8b0da4c9cedd8c9..ed69d035dfbe837afa79a3f52bd2c0c925bd19ea 100644 --- a/lite/model_parser/cpp/var_desc.h +++ b/lite/model_parser/general/var_desc.h @@ -15,14 +15,14 @@ #pragma once #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::VarDesc is the internal representation for Op. All the internal + * The general::VarDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::VarDesc. 
*/ class VarDesc : public VarDescAPI { @@ -59,6 +59,6 @@ class VarDesc : public VarDescAPI { std::vector shape_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index ea94ca52e8f123da5077f3b751ab03b857e8c390..cf93e7f2cedc8db5c5a18d26fa2499dd79c456de 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -21,7 +21,7 @@ #include "lite/core/tensor.h" #include "lite/core/variable.h" #include "lite/core/version.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/combined_params_desc.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" @@ -176,7 +176,7 @@ void LoadCombinedParamsPb(const std::string &path, const cpp::ProgramDesc &cpp_prog, bool params_from_memory) { CHECK(scope); - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // Get vars @@ -310,7 +310,7 @@ void SaveModelPb(const std::string &model_dir, void SaveCombinedParamsPb(const std::string &path, const lite::Scope &exec_scope, const cpp::ProgramDesc &cpp_prog) { - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // Get vars @@ -526,7 +526,7 @@ void SaveCombinedParamsNaive(const std::string &path, naive_buffer::proto::CombinedParamsDesc pt_desc(&table); naive_buffer::CombinedParamsDesc desc(&pt_desc); - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // set unique_var_names to avoid saving shared params repeatedly std::set unique_var_names; @@ -681,7 +681,7 @@ void LoadCombinedParamsNaive(const std::string &path, } // Check all params loaded - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); for (size_t i = 0; i < main_block_desc.VarsSize(); ++i) { auto &var = *main_block_desc.GetVar(i); diff --git a/lite/model_parser/naive_buffer/block_desc.h b/lite/model_parser/naive_buffer/block_desc.h index b0ebe7c03f954654864fb9c56d6861cde7fe9384..3f99302c4033f3f732e0c79017fc251c6d0c40b5 100644 --- a/lite/model_parser/naive_buffer/block_desc.h +++ b/lite/model_parser/naive_buffer/block_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/combined_params_desc.h b/lite/model_parser/naive_buffer/combined_params_desc.h index a5462ef5eea47867a737cd1eff344c696f9dc159..1131bab9615b53055d58ba962ad21e206ee70bfc 100644 --- a/lite/model_parser/naive_buffer/combined_params_desc.h +++ b/lite/model_parser/naive_buffer/combined_params_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/naive_buffer/op_desc.h b/lite/model_parser/naive_buffer/op_desc.h index cce0c22c2e717b6d622314f31af2dc418503c78b..f4cd2d8578cf69854fc4044b739fdfa3d6516d50 100644 --- a/lite/model_parser/naive_buffer/op_desc.h +++ b/lite/model_parser/naive_buffer/op_desc.h @@ -23,7 +23,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include 
"lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/param_desc.h b/lite/model_parser/naive_buffer/param_desc.h index 0a20b153312d99602ada77317e64c5934df0f070..ebbbdaf846a3550015ec97c11ccfb7d34271b6c5 100644 --- a/lite/model_parser/naive_buffer/param_desc.h +++ b/lite/model_parser/naive_buffer/param_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/program_desc.h b/lite/model_parser/naive_buffer/program_desc.h index 0d59b7f71f4f32d4e861b6a622cab646797bca80..6f5277ad32aa2fccf52134a262975cfdbe1b9d6c 100644 --- a/lite/model_parser/naive_buffer/program_desc.h +++ b/lite/model_parser/naive_buffer/program_desc.h @@ -15,7 +15,7 @@ #pragma once #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/var_desc.h b/lite/model_parser/naive_buffer/var_desc.h index bf0845d7464f511dfb77812612c2b99c954600da..20c8e03a5433ba98c8dc3d98af25920a934ee31d 100644 --- a/lite/model_parser/naive_buffer/var_desc.h +++ b/lite/model_parser/naive_buffer/var_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/pb/block_desc.h b/lite/model_parser/pb/block_desc.h index d541a7fbd2dee2dbabf4acdd51259898691f9188..8844173798dcacf77c876f717b71c87cbc57e5e6 100644 --- a/lite/model_parser/pb/block_desc.h +++ b/lite/model_parser/pb/block_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +50,11 @@ class BlockDesc : public BlockDescAPI { template T* GetVar(int32_t idx); + template + T const* GetVar(int32_t idx) const { + return GetVar(idx); + } + template T* AddVar(); @@ -60,6 +65,11 @@ class BlockDesc : public BlockDescAPI { template T* GetOp(int32_t idx); + template + T const* GetOp(int32_t idx) const { + return GetOp(idx); + } + template T* AddOp(); diff --git a/lite/model_parser/pb/op_desc.h b/lite/model_parser/pb/op_desc.h index f21c194a271b46c84b3a363c6f7c0d9c1f7b1f32..6f186e778298a5ae59a63188640725b3ae5322c9 100644 --- a/lite/model_parser/pb/op_desc.h +++ b/lite/model_parser/pb/op_desc.h @@ -26,7 +26,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/all.h" namespace paddle { diff --git a/lite/model_parser/pb/program_desc.h b/lite/model_parser/pb/program_desc.h index 38c667f78b98956d26231f90f66a9914eeb349dc..950bf5480db501289250ece88b28d1c1369e56fc 100644 --- a/lite/model_parser/pb/program_desc.h +++ b/lite/model_parser/pb/program_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -42,6 +42,11 @@ class ProgramDesc : public ProgramDescAPI { template T *GetBlock(int32_t idx); + template + T const *GetBlock(int32_t idx) 
const { + return GetBlock(idx); + } + template T *AddBlock(); diff --git a/lite/model_parser/pb/var_desc.cc b/lite/model_parser/pb/var_desc.cc index f849b8dd0ed103f789aec41e5c88f3e4f3cdf878..42625ee6190fb98c50de2b88a08b9910d91ed014 100644 --- a/lite/model_parser/pb/var_desc.cc +++ b/lite/model_parser/pb/var_desc.cc @@ -294,9 +294,9 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_->type().tensor_array().tensor(); default: - LOG(FATAL) - << "Getting 'tensor_desc' is not supported by the type of var %s." - << this->Name(); + LOG(WARNING) << "Getting 'tensor_desc' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return framework::proto::VarDesc().type().lod_tensor().tensor(); } @@ -312,10 +312,9 @@ std::vector VarDesc::tensor_descs() const { } return res; default: - LOG(FATAL) - << "Getting 'tensor_descs' is not supported by the type of var " - "%s." - << this->Name(); + LOG(WARNING) << "Getting 'tensor_descs' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return std::vector(); } diff --git a/lite/model_parser/pb/var_desc.h b/lite/model_parser/pb/var_desc.h index eefacef4b0c90faf132b2e4ef141ac7009939db5..d36881d5892ca8b4bef754554d164409fab4b858 100644 --- a/lite/model_parser/pb/var_desc.h +++ b/lite/model_parser/pb/var_desc.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index cd73259a338992e3a88c753c5935b547bbe7595d..17abee4a217897cef3ec7d1e03267e5d00dbef91 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -58,6 +58,7 @@ add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(group_norm_op extra SRCS group_norm_op.cc DEPS ${op_DEPS}) add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) # 3.extra ops @@ -76,6 +77,8 @@ add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_pad_op_lite extra SRCS sequence_pad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_mask_op_lite extra SRCS sequence_mask_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) @@ -86,6 +89,7 @@ add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_m add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op basic SRCS assign_value_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) +add_operator(fake_quantize_dequantize_abs_max_op extra SRCS fake_quantize_dequantize_abs_max.cc DEPS ${op_DEPS}) 
add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${op_DEPS}) @@ -110,6 +114,9 @@ add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposal add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) +add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS}) +add_operator(clip_op extra SRCS clip_op.cc DEPS ${op_DEPS}) +add_operator(print_op extra SRCS print_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) @@ -137,14 +144,17 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) +add_operator(retinanet_detection_output_op extra SRCS retinanet_detection_output_op.cc DEPS ${op_DEPS}) +add_operator(where_index_op extra SRCS where_index_op.cc DEPS ${op_DEPS}) # for content-dnn specific add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) +add_operator(topk_pooling_op extra SRCS topk_pooling_op.cc DEPS ${op_DEPS}) # for deformable-convNet -add_operator(deformable_conv_op basic SRCS deformable_conv_op.cc DEPS ${op_DEPS}) +add_operator(deformable_conv_op extra SRCS deformable_conv_op.cc DEPS ${op_DEPS}) # 4. training op add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) @@ -160,6 +170,9 @@ add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS} add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS}) add_operator(__xpu__embedding_with_eltwise_add_op extra SRCS __xpu__embedding_with_eltwise_add_op.cc DEPS ${op_DEPS}) add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc diff --git a/lite/operators/__xpu__mmdnn_op.cc b/lite/operators/__xpu__mmdnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b898c0b132dc0767c8ba28c29098ac998c2cab21 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__mmdnn_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnBidEmbGrnnAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.grnn_fw_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.grnn_fw_pool_out = + scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp2::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp2::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.emb0_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb0_out->set_lod({id_lod}); + param_.grnn_fw_pool_out->Resize( + 
{(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp2::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.emb0_out = scope->FindVar(op_desc.Output("emb0_out").front()) + ->GetMutable(); + param_.grnn_fw_pool_out = + scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + + param_.att_pool_out->Resize({(int64_t)id_lod.size() - 1, emb_tbl_dims[1]}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::CheckShape() const { return 
true; } + +bool XPUMmdnnMatchConvTopkOp::InferShapeImpl() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.input_x->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.topk_out->Resize(lite::DDim(vec_out_shape)); + param_.topk_out->set_lod(param_.input_x->lod()); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input_x = scope->FindVar(op_desc.Input("input_x").front()) + ->GetMutable(); + param_.input_y = scope->FindVar(op_desc.Input("input_y").front()) + ->GetMutable(); + param_.input_w = scope->FindVar(op_desc.Input("input_w").front()) + ->GetMutable(); + param_.conv_w = scope->FindVar(op_desc.Input("conv_w").front()) + ->GetMutable(); + + param_.topk_out = scope->FindVar(op_desc.Output("topk_out").front()) + ->GetMutable(); + + param_.input_w_max = op_desc.GetAttr("input_w_max"); + param_.conv_w_max = op_desc.GetAttr("conv_w_max"); + param_.topks = op_desc.GetAttr>("topks"); + param_.output_channel = op_desc.GetAttr("output_channel"); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.dim_t = op_desc.GetAttr("dim_t"); + return true; +} + +bool XPUMmdnnMergeAllOp::CheckShape() const { return true; } + +bool XPUMmdnnMergeAllOp::InferShapeImpl() const { + int64_t dim0 = param_.concat_7in1_x[0]->dims()[0]; + int64_t dim1 = param_.fc2_w->dims()[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(dim0); + vec_out_shape.push_back(dim1); + + param_.out->Resize(lite::DDim(vec_out_shape)); + return true; +} + +bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.concat_7in1_x.clear(); + for (auto& name : op_desc.Input("concat_7in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_7in1_x.push_back(t); + } + param_.concat_topk_x.clear(); + for (auto& name : op_desc.Input("concat_topk_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_topk_x.push_back(t); + } + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.fc0_w = scope->FindVar(op_desc.Input("fc0_w").front()) + ->GetMutable(); + param_.fc0_b = scope->FindVar(op_desc.Input("fc0_b").front()) + ->GetMutable(); + param_.fc1_w = scope->FindVar(op_desc.Input("fc1_w").front()) + ->GetMutable(); + param_.fc1_b = scope->FindVar(op_desc.Input("fc1_b").front()) + ->GetMutable(); + param_.fc2_w = scope->FindVar(op_desc.Input("fc2_w").front()) + ->GetMutable(); + param_.fc2_b = scope->FindVar(op_desc.Input("fc2_b").front()) + ->GetMutable(); + + param_.out = + scope->FindVar(op_desc.Output("out").front())->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.fc0_w_max = op_desc.GetAttr("fc0_w_max"); + param_.fc1_w_max = op_desc.GetAttr("fc1_w_max"); + param_.fc2_w_max = op_desc.GetAttr("fc2_w_max"); + return true; +} + +} // 
namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att2, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp2); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att, + paddle::lite::operators::XPUMmdnnBidEmbAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk, + paddle::lite::operators::XPUMmdnnMatchConvTopkOp); +REGISTER_LITE_OP(__xpu__mmdnn_merge_all, + paddle::lite::operators::XPUMmdnnMergeAllOp); diff --git a/lite/operators/__xpu__mmdnn_op.h b/lite/operators/__xpu__mmdnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba815a1eec7d0913bc08b4f8fa520de73a4bb835 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.h @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnBidEmbGrnnAttOp : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp() {} + + explicit XPUMmdnnBidEmbGrnnAttOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbGrnnAttOp"; } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam param_; +}; + +class XPUMmdnnBidEmbGrnnAttOp2 : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp2() {} + + explicit XPUMmdnnBidEmbGrnnAttOp2(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnBidEmbGrnnAttOp2"; + } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam2 param_; +}; + +class XPUMmdnnBidEmbAttOp : public OpLite { + public: + XPUMmdnnBidEmbAttOp() {} + + explicit XPUMmdnnBidEmbAttOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbAttOp"; } + + private: + mutable XPUMmdnnBidEmbAttParam param_; +}; + +class XPUMmdnnMatchConvTopkOp : public OpLite { + public: + XPUMmdnnMatchConvTopkOp() {} + + explicit XPUMmdnnMatchConvTopkOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, 
lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMatchConvTopkOp"; } + + private: + mutable XPUMmdnnMatchConvTopkParam param_; +}; + +class XPUMmdnnMergeAllOp : public OpLite { + public: + XPUMmdnnMergeAllOp() {} + + explicit XPUMmdnnMergeAllOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMergeAllOp"; } + + private: + mutable XPUMmdnnMergeAllParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__resnet_cbam_op.cc b/lite/operators/__xpu__resnet_cbam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6013f4fa90033c51df7a0d3bb670e02f8bf4628d --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__resnet_cbam_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUResNetCbamOp::CheckShape() const { return true; } + +bool XPUResNetCbamOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + std::vector output_shape_vec{1, 64}; + paddle::lite::DDim output_shape(output_shape_vec); + output_shape[0] = input_shape[0]; + param_.output->Resize(output_shape); + return true; +} + +bool XPUResNetCbamOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.filter.clear(); + for (auto& name : op_desc.Input("Filter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.filter.push_back(t); + } + param_.bias.clear(); + for (auto& name : op_desc.Input("Bias")) { + if (name.substr(0, 11) == "placeholder") { + param_.bias.push_back(nullptr); + } else { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.bias.push_back(t); + } + } + param_.max_filter.clear(); + for (auto& name : op_desc.Input("MaxFilter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.max_filter.push_back(t); + } + + param_.pool_p = op_desc.GetAttr("pool_p"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__resnet_cbam, paddle::lite::operators::XPUResNetCbamOp); diff --git a/lite/operators/__xpu__resnet_cbam_op.h b/lite/operators/__xpu__resnet_cbam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..26e5bafeae31183e9054e7e77ea46813c95db707 --- 
/dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUResNetCbamOp : public OpLite { + public: + XPUResNetCbamOp() {} + explicit XPUResNetCbamOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "ResNetCbam"; } + + private: + mutable XPUResNetCbamParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__search_attention_op.cc b/lite/operators/__xpu__search_attention_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..acd8c817b0d81ef03df1c05417b8bb2f56c00812 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/__xpu__search_attention_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnSearchAttentionOp::CheckShape() const { return true; } + +bool XPUMmdnnSearchAttentionOp::InferShapeImpl() const { + auto& x_dims = param_.X->dims(); + param_.Out->Resize(x_dims); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool XPUMmdnnSearchAttentionOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.W = scope->FindVar(w)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.W_max = op_desc.GetAttr("W_max"); + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.alpha0 = op_desc.GetAttr("alpha0"); + param_.alpha1 = op_desc.GetAttr("alpha1"); + param_.mask = op_desc.GetAttr("mask"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_search_attention, + paddle::lite::operators::XPUMmdnnSearchAttentionOp); diff --git a/lite/operators/__xpu__search_attention_op.h b/lite/operators/__xpu__search_attention_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81bd366ee8a51dc8d2d7fb4c9cb03d2199bcb4f2 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnSearchAttentionOp : public OpLite { + public: + XPUMmdnnSearchAttentionOp() {} + + explicit XPUMmdnnSearchAttentionOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnSearchAttentionOp"; + } + + private: + mutable XPUMmdnnSearchAttentionParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/activation_grad_ops.cc b/lite/operators/activation_grad_ops.cc index b31163e5dce6d9b77d923ba44ed58952263610a5..a30231be921e2c4445bb4c7a72c9572b14c1c0f5 100644 --- a/lite/operators/activation_grad_ops.cc +++ b/lite/operators/activation_grad_ops.cc @@ -41,15 +41,11 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, if (opdesc.HasInput("X")) { auto X_name = opdesc.Input("X").front(); param_.X = GetVar(scope, X_name); - } else { - param_.X = param_.X_grad; } if (opdesc.HasInput("Out")) { auto Out_name = opdesc.Input("Out").front(); param_.Out = GetVar(scope, Out_name); - } else { - param_.Out = param_.Out_grad; } return true; @@ -60,3 +56,5 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, } // namespace paddle REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(relu_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(tanh_grad, paddle::lite::operators::ActivationGradOp); diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index a3d9895955d99b96609a8c35e2493b17a11b9181..01e4116c94c75df3bd5360494c57419fe57c18ef 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -82,7 +82,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { param_.hard_swish_offset = opdesc.GetAttr("offset"); } else if (opdesc.Type() == "reciprocal") { param_.active_type = lite_api::ActivationType::kReciprocal; + } else if (opdesc.Type() == "thresholded_relu") { + param_.active_type = lite_api::ActivationType::kThresholdedRelu; + param_.relu_threshold = opdesc.GetAttr("threshold"); } + VLOG(4) << "opdesc.Type():" << opdesc.Type(); param_.Out = scope->FindVar(out_name)->GetMutable(); @@ -100,3 +104,4 @@ REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(thresholded_relu, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/activation_ops.h b/lite/operators/activation_ops.h index 71fda90bcd893bb0589697a7726b0b9a7500fb6d..250a88de42b4004932f78b0490a844d4a8dbc6fe 100644 --- a/lite/operators/activation_ops.h +++ b/lite/operators/activation_ops.h @@ -80,6 +80,9 @@ class ActivationOp : public OpLite { break; case lite_api::ActivationType::kIndentity: break; + case lite_api::ActivationType::kThresholdedRelu: + ch->macs = param_.X->numel(); + break; default: LOG(FATAL) << "This Type of Activation:" << static_cast(param_.active_type) diff --git a/lite/operators/assign_op.cc b/lite/operators/assign_op.cc index 
fe1e8db1f954af38041621d1d676cf16833357da..f2237230dceda55c89a423e0ee9504ee1e3c1de8 100644 --- a/lite/operators/assign_op.cc +++ b/lite/operators/assign_op.cc @@ -21,15 +21,15 @@ namespace lite { namespace operators { bool AssignOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.X); - CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.X || param_.X_array); + CHECK_OR_FALSE(param_.Out || param_.Out_array); return true; } bool AssignOpLite::InferShapeImpl() const { - if (param_.X != nullptr) { + if (param_.X) { param_.Out->Resize(param_.X->dims()); - } else if (param_.X_array != nullptr) { + } else if (param_.X_array) { param_.Out_array->resize(param_.Out_array->size()); } else { LOG(FATAL) << "x or x_array must be set."; diff --git a/lite/operators/assign_value_op.cc b/lite/operators/assign_value_op.cc index ff5b55735f7b58aa2eaa2274574336dadd8061e6..f6f8cb7e3c8958693dd7234b7a21b29b769aa96c 100644 --- a/lite/operators/assign_value_op.cc +++ b/lite/operators/assign_value_op.cc @@ -26,12 +26,15 @@ bool AssignValueOpLite::CheckShape() const { auto shape = param_.shape; auto int32_values = param_.int32_values; auto fp32_values = param_.fp32_values; + auto int64_values = param_.int64_values; + auto bool_values = param_.bool_values; size_t shape_num = 1; - for (int i = 0; i < shape.size(); i++) { + for (size_t i = 0; i < shape.size(); i++) { shape_num *= shape[i]; } - CHECK_OR_FALSE(shape_num == int32_values.size() || - shape_num == fp32_values.size()); + CHECK_OR_FALSE( + shape_num == int32_values.size() || shape_num == fp32_values.size() || + shape_num == int64_values.size() || shape_num == bool_values.size()); return true; } @@ -47,9 +50,18 @@ bool AssignValueOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { param_.shape = op_desc.GetAttr>("shape"); param_.dtype = op_desc.GetAttr("dtype"); - param_.fp32_values = op_desc.GetAttr>("fp32_values"); - param_.int32_values = op_desc.GetAttr>("int32_values"); - + if (op_desc.HasAttr("fp32_values")) { + param_.fp32_values = op_desc.GetAttr>("fp32_values"); + } + if (op_desc.HasAttr("int32_values")) { + param_.int32_values = op_desc.GetAttr>("int32_values"); + } + if (op_desc.HasAttr("int64_values")) { + param_.int64_values = op_desc.GetAttr>("int64_values"); + } + if (op_desc.HasAttr("bool_values")) { + param_.bool_values = op_desc.GetAttr>("bool_values"); + } auto out = op_desc.Output("Out").front(); param_.Out = scope->FindVar(out)->GetMutable(); return true; diff --git a/lite/operators/clip_op.cc b/lite/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8eef45f3b38cd176d1bd3d2d0b42620faf602c --- /dev/null +++ b/lite/operators/clip_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
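The activation changes earlier in this patch (activation_ops.cc / activation_ops.h) register thresholded_relu and store its threshold attribute in param_.relu_threshold. As a reference for the expected element-wise behaviour, a minimal sketch using the standard thresholded-ReLU definition (the actual kernels are not part of this hunk):

#include <vector>

// Thresholded ReLU: keep the input where it exceeds the threshold, zero elsewhere.
// One operation per element, consistent with the macs = X->numel() estimate
// added to ActivationOp::GetOpRuntimeInfo.
std::vector<float> ThresholdedRelu(const std::vector<float>& x, float threshold) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = (x[i] > threshold) ? x[i] : 0.0f;
  }
  return out;
}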
+ +#include "lite/operators/clip_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool ClipOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool ClipOpLite::InferShapeImpl() const { + param_.out->Resize(param_.x->dims()); + param_.out->set_lod(param_.x->lod()); + return true; +} + +bool ClipOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachInput(op_desc, scope, "X", false, ¶m_.x); + AttachInput(op_desc, scope, "Min", true, ¶m_.min_tensor); + AttachInput(op_desc, scope, "Max", true, ¶m_.max_tensor); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.min = op_desc.GetAttr("min"); + param_.max = op_desc.GetAttr("max"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(clip, paddle::lite::operators::ClipOpLite); diff --git a/lite/operators/clip_op.h b/lite/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..25c7f9a824ffc4b395a13df39811074724211f44 --- /dev/null +++ b/lite/operators/clip_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ClipOpLite : public OpLite { + public: + ClipOpLite() {} + + explicit ClipOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "clip"; } + + private: + mutable ClipParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc index e3678e92c9d33be5428c82331ce963f4c6067369..de8bea345fe8da1e157665b93f9d50c6f6bbffa3 100644 --- a/lite/operators/conditional_block_op.cc +++ b/lite/operators/conditional_block_op.cc @@ -20,35 +20,37 @@ namespace paddle { namespace lite { namespace operators { -bool ConditionalBlockOpLite::CheckShape() const { +bool ConditionalBlockOp::CheckShape() const { CHECK_OR_FALSE(param_.cond); - CHECK_OR_FALSE(param_.sub_block); - CHECK_OR_FALSE(param_.scope); + CHECK_OR_FALSE(param_.program_desc); + CHECK_OR_FALSE(param_.exec_scope); return true; } -bool ConditionalBlockOpLite::InferShapeImpl() const { return true; } +bool ConditionalBlockOp::InferShapeImpl() const { return true; } -bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConditionalBlockOp::AttachImpl(const cpp::OpDesc& op_desc, Scope* scope) { auto condition = op_desc.Input("Cond").front(); param_.cond = scope->FindVar(condition)->GetMutable(); - auto inputs = op_desc.Input("Input"); - for (auto var : inputs) { - param_.x.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& input : inputs) { + auto* var = scope->FindVar(input); + CHECK(var); + param_.inputs.push_back(var->GetMutable()); } - auto outs = op_desc.Output("Out"); - for (auto var : outs) { - param_.outs.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& out : outs) { + auto* var = scope->FindVar(out); + CHECK(var); + param_.outs.push_back(var->GetMutable()); } - param_.is_scalar_condition = op_desc.GetAttr("is_scalar_condition"); // obtain sub_block in core program.cc - param_.sub_block = sub_block_; - param_.scope = scope; - + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } @@ -57,4 +59,4 @@ bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, } // namespace paddle REGISTER_LITE_OP(conditional_block, - paddle::lite::operators::ConditionalBlockOpLite); + paddle::lite::operators::ConditionalBlockOp); diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h index 1815731c8df3ac07bee80aa8e0cc658e752b5c4f..adcd8acdff391e2ae3ece9ec21669d853250dcf4 100644 --- a/lite/operators/conditional_block_op.h +++ b/lite/operators/conditional_block_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,27 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class ConditionalBlockOpLite : public OpLite { +class ConditionalBlockOp : public OpLite { public: - ConditionalBlockOpLite() {} - explicit ConditionalBlockOpLite(const std::string &op_type) - : OpLite(op_type) {} + ConditionalBlockOp() {} + explicit ConditionalBlockOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conditional_block"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable ConditionalBlockParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index c3e375e2e44b8184e6e7e635ab2c6c1f8889f844..a1d4e2e8a038046b257b3ab5f936cc4cb2e62c67 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -74,7 +74,7 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); param_.dilations = std::make_shared>(dilations); @@ -130,15 +130,18 @@ class ConvOpLite : public OpLite { padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = - op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) { - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto filter_name = op_info->Input("Filter").front(); + auto output_name = op_info->Output("Output").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(filter_name)) + param_.weight_scale = op_info->GetInputScale(filter_name); + if (op_info->HasOutputScale(output_name)) { + param_.output_scale = op_info->GetOutputScale(output_name)[0]; } } diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index 9d098eb975ef071a4650ea547d6081d950b251f1..732f8c5056f930259655339c8d8a0b2846f29313 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -106,7 +106,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); 
auto dilations = op_desc.GetAttr>("dilations"); diff --git a/lite/operators/deformable_conv_op.cc b/lite/operators/deformable_conv_op.cc index 8cc8614d00801fb033bc3f449e82f9f03e271db5..a834528f27c9d6c97e355a1a149482ad00ae79aa 100644 --- a/lite/operators/deformable_conv_op.cc +++ b/lite/operators/deformable_conv_op.cc @@ -84,5 +84,5 @@ bool DeformableConvOpLite::InferShapeImpl() const { } // namespace lite } // namespace paddle -REGISTER_LITE_OP(DeformableConv2d, +REGISTER_LITE_OP(deformable_conv, paddle::lite::operators::DeformableConvOpLite); diff --git a/lite/operators/deformable_conv_op.h b/lite/operators/deformable_conv_op.h index aa736fcef6b6f74740253b8607e8bfcd938d0ff8..69b764758699089bdee0a64e33a01d838b011ec0 100644 --- a/lite/operators/deformable_conv_op.h +++ b/lite/operators/deformable_conv_op.h @@ -83,7 +83,7 @@ class DeformableConvOpLite : public OpLite { param_.conv_param.filter = scope->FindVar(Filter)->GetMutable(); param_.conv_param.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); auto dilations = op_desc.GetAttr>("dilations"); param_.conv_param.groups = op_desc.GetAttr("groups"); param_.conv_param.dilations = std::make_shared>(dilations); diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 6cc41f0a66cfac4a0baa0153765a59766fa045f4..5895bb667aa22507d362004627304ecf78e085f1 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -144,6 +144,8 @@ REGISTER_LITE_OP(elementwise_add, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_mod, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_pow, paddle::lite::operators::ElementwiseOp); // #ifdef LITE_WITH_TRAIN // REGISTER_LITE_OP(elementwise_sub_grad, diff --git a/lite/operators/fake_quantize_dequantize_abs_max.cc b/lite/operators/fake_quantize_dequantize_abs_max.cc new file mode 100644 index 0000000000000000000000000000000000000000..354f5e9dcdbd55f634ae394187c5f9163eb9c25a --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
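The fake_quantize_dequantize_abs_max op whose definition begins just below only declares X, Out, OutScale and the bit_length attribute; the numeric behaviour lives in the kernels. For orientation, a sketch of the usual abs-max quantize/dequantize round trip, assuming the standard formulation (scale = max|x|, bin_cnt = 2^(bit_length-1) - 1); the exact rounding mode is an assumption:

#include <algorithm>
#include <cmath>
#include <vector>

// Quantize to bit_length bits with an abs-max scale, then dequantize again.
// The result approximates the input and out_scale records max|x|.
std::vector<float> FakeQuantDequantAbsMax(const std::vector<float>& x,
                                          int bit_length, float* out_scale) {
  float scale = 0.0f;
  for (float v : x) scale = std::max(scale, std::fabs(v));
  *out_scale = scale;
  const float bin_cnt = static_cast<float>((1 << (bit_length - 1)) - 1);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = (scale == 0.0f)
                 ? 0.0f
                 : std::round(x[i] / scale * bin_cnt) * scale / bin_cnt;
  }
  return out;
}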
+ +#include "lite/operators/fake_quantize_dequantize_abs_max.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators {} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(fake_quantize_dequantize_abs_max, + paddle::lite::operators::FakeQuantizeDequantizeAbsMaxOpLite); diff --git a/lite/operators/fake_quantize_dequantize_abs_max.h b/lite/operators/fake_quantize_dequantize_abs_max.h new file mode 100644 index 0000000000000000000000000000000000000000..7413b448ea5e2317501960a246478d15242f9cdc --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FakeQuantizeDequantizeAbsMaxOpLite : public OpLite { + public: + FakeQuantizeDequantizeAbsMaxOpLite() {} + + explicit FakeQuantizeDequantizeAbsMaxOpLite(const std::string &type) + : OpLite(type) {} + + bool CheckShape() const override { return true; } + + bool InferShapeImpl() const override { return true; } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + auto out_scale = op_desc.Output("OutScale").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_scale = scope->FindVar(out_scale)->GetMutable(); + param_.bit_length = op_desc.GetAttr("bit_length"); + return true; + } + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fake_quantize_dequantize_abs_max"; + } + + private: + mutable FakeQuantDequantAbsMaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index d4032c5e8b98ff6d5763d2d06610d2e214ad90ca..28a220da2de0920643d46f1ed9c610dfa613cf95 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -102,14 +102,18 @@ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto 
input_name = op_info->Input("Input").front(); + auto weight_name = op_info->Input("W").front(); + auto out_name = op_info->Output("Out").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(weight_name)) + param_.weight_scale = op_info->GetInputScale(weight_name); + if (op_info->HasOutputScale(out_name)) + param_.output_scale = op_info->GetOutputScale(out_name)[0]; } return true; } diff --git a/lite/operators/group_norm_op.cc b/lite/operators/group_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1a6413ebb140bac4a1d7e74ef42413f489395c7 --- /dev/null +++ b/lite/operators/group_norm_op.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/group_norm_op.h" +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool GroupNormOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.scale); + CHECK_OR_FALSE(param_.bias); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.saved_mean); + CHECK_OR_FALSE(param_.saved_variance); + auto x_dims = param_.x->dims(); + auto scale_dims = param_.scale->dims(); + auto bias_dims = param_.bias->dims(); + CHECK(x_dims.size() >= 2 && x_dims.size() <= 5) + << "Input X must have 2 to 5 dimensions."; + CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimensions."; + CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimensions."; + CHECK_GT(param_.epsilon, 0.f) << "epsilon should be greater than 0.f"; + CHECK_LT(param_.epsilon, 0.01f) << "epsilon should be less than 0.01f"; + CHECK_EQ(param_.channels, x_dims[1]) + << "Input channels must be equal input_shape[1]"; + CHECK_EQ(param_.channels % param_.groups, 0) + << "channels must be divide groups"; + return true; +} + +bool GroupNormOp::InferShapeImpl() const { + auto x_dims = param_.x->dims(); + int64_t batch_size = x_dims[0]; + int64_t num = param_.channels / param_.groups; + param_.saved_mean->Resize({batch_size * num}); + param_.saved_variance->Resize({batch_size * num}); + param_.out->Resize(x_dims); + return true; +} + +bool GroupNormOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { + param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable(); + param_.scale = + scope->FindVar(op_desc.Input("Scale").front())->GetMutable(); + param_.bias = + scope->FindVar(op_desc.Input("Bias").front())->GetMutable(); + param_.saved_mean = + scope->FindVar(op_desc.Output("SavedMean").front())->GetMutable(); + param_.saved_variance = + scope->FindVar(op_desc.Output("SavedVariance").front()) + ->GetMutable(); + param_.out = + scope->FindVar(op_desc.Output("Y").front())->GetMutable(); + param_.epsilon = op_desc.GetAttr("epsilon"); + param_.groups = op_desc.GetAttr("groups"); + param_.channels = 
op_desc.GetAttr("channels"); + return true; +} + +} /* namespace operators */ +} /* namespace lite */ +} /* namespace paddle */ + +REGISTER_LITE_OP(group_norm, paddle::lite::operators::GroupNormOp); diff --git a/lite/operators/group_norm_op.h b/lite/operators/group_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f2251686ea2caa89e3934e8adae69466f9c9515d --- /dev/null +++ b/lite/operators/group_norm_op.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class GroupNormOp : public OpLite { + public: + GroupNormOp() {} + + explicit GroupNormOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "group_norm"; } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + ch->input_shape = ch->DimToStr(param_.x->dims()); + ch->output_shape = ch->DimToStr(param_.out->dims()); + // ch->remark = ""; + auto x_dims = param_.x->dims(); + auto nc = x_dims[0] * x_dims[1]; + auto hw = x_dims[2] * x_dims[3]; + auto nchw = x_dims.production(); + ch->macs = 5.f * nchw + 3.f * (nc + hw); + } +#endif + + private: + mutable GroupNormParam param_; +}; + +} /* namespace operators */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/operators/gru_op.cc b/lite/operators/gru_op.cc index 862a1ff98f699393c9aa91afab978f947cc25187..0a9128dcd27870f6456b26ba636d4189267583be 100644 --- a/lite/operators/gru_op.cc +++ b/lite/operators/gru_op.cc @@ -75,9 +75,8 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { auto batch_reset_hidden_prev = op_desc.Output("BatchResetHiddenPrev").front(); auto batch_hidden = op_desc.Output("BatchHidden").front(); auto hidden = op_desc.Output("Hidden").front(); - param_.input = scope->FindVar(input)->GetMutable(); - if (op_desc.Input("H0").size()) { + if (!op_desc.Input("H0").empty()) { auto h0 = op_desc.Input("H0").front(); param_.h0 = scope->FindVar(h0)->GetMutable(); } @@ -90,7 +89,7 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { scope->FindVar(batch_hidden)->GetMutable(); param_.hidden = scope->FindVar(hidden)->GetMutable(); - if (op_desc.HasInput("Bias")) { + if (!op_desc.Input("Bias").empty()) { auto bias = op_desc.Input("Bias").front(); param_.bias = scope->FindVar(bias)->GetMutable(); } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc index 1cc751109f76a96097d363b493322dde182a715d..fd70143131b458c1d985a21a6d9d84c707ba9986 100644 --- 
a/lite/operators/match_matrix_tensor_op.cc +++ b/lite/operators/match_matrix_tensor_op.cc @@ -94,6 +94,18 @@ bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.dim_t = op_desc.GetAttr("dim_t"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/max_pool_with_index_op.h b/lite/operators/max_pool_with_index_op.h index bd82743c279c4728483c72f017a8fa6e94cf3eb4..dfc220907549dc9ce61726b79cb1626c2734b234 100644 --- a/lite/operators/max_pool_with_index_op.h +++ b/lite/operators/max_pool_with_index_op.h @@ -54,7 +54,7 @@ class MaxPoolWithIndexOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("adaptive")) { param_.adaptive = op_desc.GetAttr("adaptive"); } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 955f5a19de6c191f5eba53774f5137c90d481dd8..ef728924c1afe2cb4040ca84c27dc9ea09f18190 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -21,10 +21,9 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/utils/all.h" -#include "lite/utils/variant.h" /* * This file contains all the argument parameter data structure for operators. 
*/ @@ -91,9 +90,9 @@ struct SubgraphParam : ParamBase { std::vector output_names{}; std::vector input_data_names{}; std::vector output_data_names{}; - int sub_block_idx{-1}; - cpp::BlockDesc* sub_block_desc{nullptr}; - Scope* scope{nullptr}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; /// -------------------------- NN operators ------------------------------------ @@ -358,6 +357,8 @@ struct ActivationParam : ParamBase { float hard_swish_threshold{6.0}; float hard_swish_scale{6.0}; float hard_swish_offset{3.0}; + // thresholded_relu + float relu_threshold{1.0f}; }; struct ActivationGradParam : ParamBase { @@ -677,6 +678,13 @@ struct FakeChannelWiseDequantizeMaxAbsParam : ParamBase { std::vector quant_bits; }; +struct FakeQuantDequantAbsMaxParam : ParamBase { + const lite::Tensor* x{}; + lite::Tensor* out{}; + lite::Tensor* out_scale{}; + int bit_length; +}; + /// ----------------------- sgd operators ---------------------- struct SGDParam : ParamBase { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; @@ -938,11 +946,10 @@ struct CompareParam : ParamBase { }; struct WhileParam : ParamBase { - Scope* scope{}; Tensor* cond{}; - cpp::BlockDesc* sub_block{}; - std::vector x{}; - std::vector outs{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; struct TopkParam : ParamBase { @@ -1030,12 +1037,28 @@ struct SequenceExpandParam : ParamBase { int ref_level{-1}; }; +struct SequencePadParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* PadValue{}; + lite::Tensor* Out{}; + lite::Tensor* Length{}; + int padded_length{-1}; +}; + struct SequenceUnpadParam : ParamBase { const lite::Tensor* X{}; const lite::Tensor* Length{}; lite::Tensor* Out{}; }; +struct SequenceMaskParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* MaxLenTensor{nullptr}; + lite::Tensor* Y{}; + int maxlen{-1}; + int out_dtype; +}; + struct SequenceExpandAsParam : ParamBase { const lite::Tensor* x{nullptr}; const lite::Tensor* y{nullptr}; @@ -1112,6 +1135,11 @@ struct VarConv2DParam : ParamBase { int kernel_w; bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// ----------------------- shape operators ---------------------- @@ -1329,6 +1357,8 @@ struct AssignValueParam : ParamBase { int dtype{}; std::vector fp32_values{}; std::vector int32_values{}; + std::vector int64_values{}; + std::vector bool_values{}; lite::Tensor* Out{}; }; @@ -1343,6 +1373,15 @@ struct SequenceTopkAvgPoolingParam : ParamBase { std::vector topks{}; }; +/// --------------- topk_pooling operators ------------------ +struct TopkPoolingParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + lite::Tensor* Out{}; + int top_k{1}; + int feat_map_num{1}; +}; + /// --------------- search_fc operators ------------------ struct SearchFcParam : ParamBase { const lite::Tensor* X{}; @@ -1350,6 +1389,13 @@ struct SearchFcParam : ParamBase { const lite::Tensor* b{}; lite::Tensor* Out{}; int out_size{}; + + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// --------------------- match_matrix_tensor operators -------------------- struct MatchMatrixTensorParam : ParamBase { @@ -1360,6 +1406,12 @@ struct MatchMatrixTensorParam : ParamBase { lite::Tensor* tmp{}; int dim_t; + 
bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is w already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in w +#endif }; /// --------------------- search_seq_depadding operators -------------------- @@ -1381,6 +1433,12 @@ struct SearchGrnnParam : ParamBase { lite::Tensor* tmp_buffer{}; lite::Tensor* idx_sorted_by_width{}; lite::Tensor* layout_input{}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is wi/wh already converted to int16/int8 + std::vector __xpu__wi_max; // Abs max in wi + std::vector __xpu__wh_max; // Abs max in wh +#endif }; struct SplitLodTensorParam : ParamBase { @@ -1402,10 +1460,11 @@ struct MergeLodTensorParam : ParamBase { struct ConditionalBlockParam : ParamBase { const lite::Tensor* cond{}; - std::vector x{}; + std::vector inputs{}; std::vector outs{}; - cpp::BlockDesc* sub_block{}; - Scope* scope{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; bool is_scalar_condition{}; }; @@ -1436,6 +1495,19 @@ struct InstanceNormParam : ParamBase { lite::Tensor* saved_variance{}; float epsilon; }; +/// --------------------- group_norm operators -------------------- +struct GroupNormParam : ParamBase { + lite::Tensor* x{}; + lite::Tensor* out{}; + lite::Tensor* bias{}; + lite::Tensor* scale{}; + lite::Tensor* saved_mean{}; + lite::Tensor* saved_variance{}; + float epsilon; + int groups; + int channels; +}; + /// --------------------- grid sampler operators -------------------- struct GridSamplerParam : ParamBase { lite::Tensor* x{}; @@ -1522,6 +1594,132 @@ struct XPUFcParam : ParamBase { std::string activation_type{""}; }; +struct XPUResNetCbamParam : ParamBase { + lite::Tensor* input{}; + std::vector filter; + std::vector bias; + std::vector max_filter; + lite::Tensor* output{}; + + float pool_p{1.0f}; +}; + +struct XPUMmdnnSearchAttentionParam : ParamBase { + lite::Tensor* X{}; + lite::Tensor* W{}; + lite::Tensor* b{}; + lite::Tensor* Out{}; + + float W_max{0.0f}; + int pad_id{0}; + float alpha0{1.0f}; + float alpha1{1.0f}; + float mask{1.0f}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam2 : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* emb0_out{}; + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnBidEmbAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* att_fc_w{}; + 
lite::Tensor* att_fc_b{}; + + float att_fc_w_max{0.0f}; + + lite::Tensor* att_pool_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnMatchConvTopkParam : ParamBase { + lite::Tensor* input_x{}; + lite::Tensor* input_y{}; + lite::Tensor* input_w{}; + lite::Tensor* conv_w{}; + + float input_w_max{0.0f}; + float conv_w_max{0.0f}; + std::vector topks; + int output_channel{0}; + int channel_num{0}; + int dim_t{0}; + + lite::Tensor* topk_out{}; +}; + +struct XPUMmdnnMergeAllParam : ParamBase { + std::vector concat_7in1_x; + std::vector concat_topk_x; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* fc0_w{}; + lite::Tensor* fc0_b{}; + lite::Tensor* fc1_w{}; + lite::Tensor* fc1_b{}; + lite::Tensor* fc2_w{}; + lite::Tensor* fc2_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float fc0_w_max{0.0f}; + float fc1_w_max{0.0f}; + float fc2_w_max{0.0f}; + + lite::Tensor* out{}; +}; + // For DeformableConvolution op struct DeformableConvParam : ParamBase { lite::Tensor* x{}; @@ -1560,6 +1758,50 @@ struct PixelShuffleParam : ParamBase { lite::Tensor* output{nullptr}; int upscale_factor{1}; }; + +struct RetinanetDetectionOutputParam : ParamBase { + std::vector bboxes{}; + std::vector scores{}; + std::vector anchors{}; + Tensor* im_info{}; + Tensor* out{}; + float score_threshold{}; + int nms_top_k{}; + float nms_threshold{}; + float nms_eta{}; + int keep_top_k{}; +}; + +struct WhereIndexParam : ParamBase { + const lite::Tensor* input{nullptr}; + lite::Tensor* output{nullptr}; +}; + +struct ClipParam : ParamBase { + Tensor* x{}; + Tensor* min_tensor{}; + Tensor* max_tensor{}; + Tensor* out{}; + float min{}; + float max{}; +}; + +struct PrintParam : ParamBase { + const lite::Tensor* in{}; + lite::Tensor* out{}; + std::string name; + int first_n{-1}; + std::string message; + int summarize{20}; + bool print_tensor_name{true}; + bool print_tensor_type{true}; + bool print_tensor_shape{true}; + bool print_tensor_lod{true}; + bool print_tensor_layout{true}; + std::string print_phase; + bool is_forward{true}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pixel_shuffle_op.cc b/lite/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..40f564bdd6d2699bafe497bdfded21ea4f3956a3 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/pixel_shuffle_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool PixelShuffleOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.upscale_factor); + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + CHECK_EQ_OR_FALSE(x_dims[1] % (upscale_factor * upscale_factor), 0); + return true; +} + +bool PixelShuffleOpLite::InferShapeImpl() const { + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + auto output_dims = x_dims; + output_dims[0] = x_dims[0]; + output_dims[1] = x_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = x_dims[2] * upscale_factor; + output_dims[3] = x_dims[3] * upscale_factor; + param_.output->Resize(output_dims); + return true; +} + +bool PixelShuffleOpLite::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto input = opdesc.Input("X").front(); + auto out = opdesc.Output("Out").front(); + + param_.x = scope->FindVar(input)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + + if (opdesc.HasAttr("upscale_factor")) { + param_.upscale_factor = opdesc.GetAttr("upscale_factor"); + } + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(pixel_shuffle, paddle::lite::operators::PixelShuffleOpLite); diff --git a/lite/operators/pixel_shuffle_op.h b/lite/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..63efd8df778c6d92bc448f795c19ff5bffba62c8 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PixelShuffleOpLite : public OpLite { + public: + PixelShuffleOpLite() {} + explicit PixelShuffleOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "pixel_shuffle"; } + + private: + mutable PixelShuffleParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index 92f00a4272fddeb03abd04cba473a997cce37217..916ed1dd6f036c6c36954622abbbc1361de1b790 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -54,7 +54,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); diff --git a/lite/operators/print_op.cc b/lite/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f4299aed06f17d7bf3bd30b9fec34c587168884 --- /dev/null +++ b/lite/operators/print_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/print_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool PrintOp::CheckShape() const { + CHECK_OR_FALSE(param_.in); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool PrintOp::InferShapeImpl() const { + param_.out->set_lod(param_.in->lod()); + param_.out->Resize(param_.in->dims()); + return true; +} + +bool PrintOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachParam(¶m_); + + param_.name = op_desc.Input("In").front(); + param_.in = scope->FindTensor(param_.name); + param_.out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.first_n = op_desc.GetAttr("first_n"); + param_.message = op_desc.GetAttr("message"); + param_.summarize = op_desc.GetAttr("summarize"); + param_.print_tensor_name = op_desc.GetAttr("print_tensor_name"); + param_.print_tensor_type = op_desc.GetAttr("print_tensor_type"); + param_.print_tensor_shape = op_desc.GetAttr("print_tensor_shape"); + param_.print_tensor_lod = op_desc.GetAttr("print_tensor_lod"); + param_.print_tensor_layout = op_desc.GetAttr("print_tensor_layout"); + param_.print_phase = op_desc.GetAttr("print_phase"); + param_.is_forward = op_desc.GetAttr("is_forward"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(print, paddle::lite::operators::PrintOp); diff --git a/lite/operators/print_op.h b/lite/operators/print_op.h new file mode 100644 index 0000000000000000000000000000000000000000..cd8e777b59c3aac92771442402cf16623b75fbef --- /dev/null +++ b/lite/operators/print_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PrintOp : public OpLite { + public: + PrintOp() {} + explicit PrintOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "print"; } + + private: + mutable PrintParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/retinanet_detection_output_op.cc b/lite/operators/retinanet_detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27f2bfca0ab25b8f73d4c6a68d539a7c22389e0 --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/retinanet_detection_output_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool RetinanetDetectionOutputOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.bboxes.size() > 0); + CHECK_OR_FALSE(param_.scores.size() > 0); + CHECK_OR_FALSE(param_.anchors.size() > 0); + CHECK_OR_FALSE(param_.bboxes.size() == param_.scores.size()); + CHECK_OR_FALSE(param_.bboxes.size() == param_.anchors.size()); + CHECK_OR_FALSE(param_.im_info); + CHECK_OR_FALSE(param_.out); + + DDim bbox_dims = param_.bboxes.front()->dims(); + DDim score_dims = param_.scores.front()->dims(); + DDim anchor_dims = param_.anchors.front()->dims(); + DDim im_info_dims = param_.im_info->dims(); + + CHECK_OR_FALSE(bbox_dims.size() == 3); + CHECK_OR_FALSE(score_dims.size() == 3); + CHECK_OR_FALSE(anchor_dims.size() == 2); + CHECK_OR_FALSE(bbox_dims[2] == 4); + CHECK_OR_FALSE(bbox_dims[1] == score_dims[1]); + CHECK_OR_FALSE(anchor_dims[0] == bbox_dims[1]); + CHECK_OR_FALSE(im_info_dims.size() == 2); + + return true; +} + +bool RetinanetDetectionOutputOpLite::InferShapeImpl() const { + DDim bbox_dims = param_.bboxes.front()->dims(); + param_.out->Resize({bbox_dims[1], bbox_dims[2] + 2}); + return true; +} + +bool RetinanetDetectionOutputOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + for (auto arg_name : op_desc.Input("BBoxes")) { + param_.bboxes.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Scores")) { + param_.scores.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Anchors")) { + param_.anchors.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + AttachInput(op_desc, scope, "ImInfo", false, ¶m_.im_info); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.score_threshold = op_desc.GetAttr("score_threshold"); + param_.nms_top_k = op_desc.GetAttr("nms_top_k"); + param_.nms_threshold = op_desc.GetAttr("nms_threshold"); + param_.nms_eta = op_desc.GetAttr("nms_eta"); + param_.keep_top_k = op_desc.GetAttr("keep_top_k"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(retinanet_detection_output, + paddle::lite::operators::RetinanetDetectionOutputOpLite); diff --git a/lite/operators/retinanet_detection_output_op.h b/lite/operators/retinanet_detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9969227e15941644249b46ba7372f9afc705672c --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class RetinanetDetectionOutputOpLite : public OpLite { + public: + RetinanetDetectionOutputOpLite() {} + + explicit RetinanetDetectionOutputOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "retinanet_detection_output"; + } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif + + private: + mutable RetinanetDetectionOutputParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc index 71e62c2ae729b4e1516a219888b9af3f7d994428..8024c38f9cc4a6d3ba2d47d6c61e716dd57bb362 100644 --- a/lite/operators/search_fc_op.cc +++ b/lite/operators/search_fc_op.cc @@ -70,6 +70,18 @@ bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.Out = scope->FindVar(Out)->GetMutable(); param_.out_size = op_desc.GetAttr("out_size"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc index 1ced477c109d8cd93485f0193523887759939f17..6f743693bc782e636064ca398539433b497dc645 100644 --- a/lite/operators/search_grnn_op.cc +++ b/lite/operators/search_grnn_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/search_grnn_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -84,6 +85,18 @@ bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.layout_input = scope->FindVar(layout_input)->GetMutable(); +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__wi_max")) { + param_.__xpu__wi_max = op_desc.GetAttr>("__xpu__wi_max"); + } + if (op_desc.HasAttr("__xpu__wh_max")) { + param_.__xpu__wh_max = op_desc.GetAttr>("__xpu__wh_max"); + } +#endif + return true; } diff --git a/lite/operators/sequence_mask_op.cc b/lite/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bac1dc8a26abe9a9ae2bbd77e03c2375b4814268 --- /dev/null +++ b/lite/operators/sequence_mask_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_mask_op.h" + +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + return true; +} + +bool SequenceMaskOp::InferShapeImpl() const { return true; } + +bool SequenceMaskOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + if (opdesc.HasInput("MaxLenTensor") && + !opdesc.Input("MaxLenTensor").empty()) { + auto var = scope->FindVar(opdesc.Input("MaxLenTensor").front()); + if (var != nullptr) { + param_.MaxLenTensor = var->GetMutable(); + } + } + param_.Y = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + param_.maxlen = opdesc.GetAttr("maxlen"); + param_.out_dtype = opdesc.GetAttr("out_dtype"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_mask, paddle::lite::operators::SequenceMaskOp); diff --git a/lite/operators/sequence_mask_op.h b/lite/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97008b865b850f3837fcc49befc5735987fb2048 --- /dev/null +++ b/lite/operators/sequence_mask_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
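SequenceMaskOp above only validates X/Y and forwards the maxlen and out_dtype attributes; the mask itself is produced by the kernels. A reference sketch of the conventional semantics, assuming the usual rule that maxlen < 1 means "pad to the longest length found in X" (an assumption, since the kernel is not in this hunk):

#include <algorithm>
#include <cstdint>
#include <vector>

// For each length X[i], emit a row of maxlen entries: 1 while j < X[i], else 0.
std::vector<std::vector<int64_t>> SequenceMask(const std::vector<int64_t>& lengths,
                                               int maxlen) {
  if (maxlen < 1) {
    int64_t longest = 0;
    for (int64_t v : lengths) longest = std::max(longest, v);
    maxlen = static_cast<int>(longest);
  }
  std::vector<std::vector<int64_t>> y(lengths.size(),
                                      std::vector<int64_t>(maxlen, 0));
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int j = 0; j < maxlen && j < lengths[i]; ++j) y[i][j] = 1;
  }
  return y;
}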
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SequenceMaskOp : public OpLite {
+ public:
+  SequenceMaskOp() {}
+  explicit SequenceMaskOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "sequence_mask"; }
+
+ private:
+  mutable SequenceMaskParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/sequence_pad_op.cc b/lite/operators/sequence_pad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..858c0ffcbb1a8e739cf4575e9f2f8882fd231912
--- /dev/null
+++ b/lite/operators/sequence_pad_op.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/sequence_pad_op.h"
+#include
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SequencePadOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.PadValue);
+  CHECK_OR_FALSE(param_.Out);
+  CHECK_OR_FALSE(param_.Length);
+
+  return true;
+}
+
+bool SequencePadOp::InferShapeImpl() const {
+  auto x_dims = param_.X->dims();
+  CHECK_GE(x_dims.size(), 2) << "The rank of SequencePad OP Input(x) can't be "
+                                "less than 2. But the rank we received is "
+                             << x_dims.size();
+  auto time_step_dims = x_dims.Slice(1, x_dims.size());
+  auto pad_value_dims = param_.PadValue->dims();
+  CHECK_EQ((pad_value_dims == DDim({1})) || (pad_value_dims == time_step_dims),
+           true)
+      << "The SequencePad OP Input(PadValue) must be a scalar or a tensor "
+         "whose shape equals to time steps in sequences";
+
+  auto x_lod = param_.X->lod();
+  CHECK_EQ(x_lod.empty(), false)
+      << "The SequencePad OP Input(X) must hold lod info.";
+  const auto &x_lod_0 = x_lod[0];
+  CHECK_GE(x_lod_0.size(), 2)
+      << "The size of SequencePadOp Input(X)'s lod info can't be less than 2. "
+         "But the size we received is "
+      << x_lod_0.size();
+  CHECK_EQ(x_dims[0], static_cast<int64_t>(x_lod_0.back()))
+      << "The SequencePadOp Input(X)'s lod info mismatches the actual tensor "
+         "shape. The 1st dimension of Input(X)'s lod info is "
+      << x_dims[0] << ", the 1st dimension of actual tensor shape is "
+      << static_cast<int64_t>(x_lod_0.back());
+
+  int seq_num = x_lod_0.size() - 1;
+  int max_seq_len = 0;
+  for (int i = 0; i < seq_num; ++i) {
+    max_seq_len =
+        std::max(max_seq_len, static_cast<int>(x_lod_0[i + 1] - x_lod_0[i]));
+  }
+  int real_padded_length = param_.padded_length;
+  if (real_padded_length == -1) {
+    real_padded_length = max_seq_len;
+  }
+  CHECK_GE(real_padded_length, max_seq_len)
+      << "The SequencePadOp Attr(padded_length) should be greater than or "
+         "equal to the length of the longest original sequence. But the "
+         "padded_length we received is "
+      << real_padded_length
+      << ", the length of the longest original sequence is " << max_seq_len;
+
+  int out_dim_0 = seq_num;
+  std::vector<int64_t> out_dims_vec{out_dim_0, real_padded_length};
+  std::vector<int64_t> len_dims_vec{out_dim_0};
+  auto time_step_dims_vec = time_step_dims.Vectorize();
+  out_dims_vec.insert(
+      out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end());
+  param_.Out->Resize(out_dims_vec);
+  param_.Length->Resize(len_dims_vec);
+  return true;
+}
+
+bool SequencePadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
+  param_.PadValue = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("PadValue").front())->Get<lite::Tensor>());
+  param_.Length = scope->FindVar(opdesc.Output("Length").front())
+                      ->GetMutable<lite::Tensor>();
+  param_.Out =
+      scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
+  param_.padded_length = opdesc.GetAttr<int>("padded_length");
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(sequence_pad, paddle::lite::operators::SequencePadOp);
diff --git a/lite/operators/sequence_pad_op.h b/lite/operators/sequence_pad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd5d732a5d8816d4f7994ee0e3175ac8a032b2d4
--- /dev/null
+++ b/lite/operators/sequence_pad_op.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequencePadOp : public OpLite { + public: + SequencePadOp() {} + explicit SequencePadOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_pad"; } + + private: + mutable SequencePadParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc index 19a47cac9da666269fc5ef2a172ff0295b71e95d..fa2b0553aa2ac84f27d5d27d31df5ce9584d82c3 100644 --- a/lite/operators/sequence_reverse_op.cc +++ b/lite/operators/sequence_reverse_op.cc @@ -34,6 +34,7 @@ bool SequenceReverseOp::InferShapeImpl() const { const auto *input = param_.X; auto out_dims = input->dims(); param_.Out->Resize(out_dims); + param_.Out->set_lod(param_.X->lod()); return true; } @@ -45,6 +46,7 @@ bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, scope->FindVar(opdesc.Output("Y").front())->GetMutable(); CHECK(param_.X); CHECK(param_.Out); + return true; } diff --git a/lite/operators/sequence_unpad_op.cc b/lite/operators/sequence_unpad_op.cc index b91d43c741f002b2bdb30e161688cd40b462faee..4f4497f0b81b5710e71cd0a2fcce10e9559d9d30 100644 --- a/lite/operators/sequence_unpad_op.cc +++ b/lite/operators/sequence_unpad_op.cc @@ -32,32 +32,7 @@ bool SequenceUnpadOp::CheckShape() const { return true; } -bool SequenceUnpadOp::InferShapeImpl() const { - auto x_dims = param_.X->dims(); - auto len_dims = param_.Length->dims(); - - auto *seq_len_ptr = param_.Length->data(); - int64_t batch_size = len_dims[0]; - std::vector out_lod0(batch_size + 1, 0); - for (int64_t i = 0; i < batch_size; ++i) { - out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; - } - paddle::lite::LoD out_lod; - out_lod.push_back(out_lod0); - - int64_t out_dim0 = out_lod0.back(); - std::vector out_dims{out_dim0}; - if (x_dims.size() == 2) { - out_dims.push_back(1); - } else { - for (size_t i = 2; i < x_dims.size(); ++i) { - out_dims.push_back(x_dims[i]); - } - } - param_.Out->Resize(out_dims); - param_.Out->set_lod(out_lod); - return true; -} +bool SequenceUnpadOp::InferShapeImpl() const { return true; } bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc index 9ac07e96334eda9f0001d33e0789f9de15c4ca67..fec5a0e3254328220508f28a16b110beb01fb613 100644 --- a/lite/operators/subgraph_op.cc +++ b/lite/operators/subgraph_op.cc @@ -39,10 +39,11 @@ bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { op_desc.GetAttr>("input_data_names"); param_.output_data_names = op_desc.GetAttr>("output_data_names"); - CHECK(param_.sub_block_desc); - param_.sub_block_idx = op_desc.GetAttr("sub_block"); - param_.scope = scope; - CHECK(param_.scope); + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h index edbfb922044d60165e589d389cd8cfb3b2547796..df6448f2f78a08f41ac037a13d14cbca1725cfb5 
100644 --- a/lite/operators/subgraph_op.h +++ b/lite/operators/subgraph_op.h @@ -13,14 +13,11 @@ // limitations under the License. #pragma once - +#include #include #include -#include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" #include "lite/utils/all.h" namespace paddle { @@ -37,14 +34,18 @@ class SubgraphOp : public OpLite { bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "subgraph"; } - void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; } - cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable SubgraphParam param_; diff --git a/lite/operators/topk_pooling_op.cc b/lite/operators/topk_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76634d216a8a120f4e83dfe511089c6deb750cba --- /dev/null +++ b/lite/operators/topk_pooling_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/topk_pooling_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool TopkPoolingOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool TopkPoolingOp::InferShapeImpl() const { + auto out_dims = param_.X->dims(); + out_dims[1] *= param_.top_k; + auto out = param_.Out; + out->Resize(out_dims); + out->set_lod(param_.X->lod()); + + return true; +} + +bool TopkPoolingOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto y = op_desc.Input("Y").front(); + param_.X = scope->FindTensor(x); + param_.Y = scope->FindTensor(y); + auto output = op_desc.Output("Out").front(); + param_.Out = scope->FindMutableTensor(output); + param_.top_k = op_desc.GetAttr("top_k"); + param_.feat_map_num = op_desc.GetAttr("feat_map_num"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(topk_pooling, paddle::lite::operators::TopkPoolingOp); diff --git a/lite/operators/topk_pooling_op.h b/lite/operators/topk_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec48c476ca3e6854038bed591ca59402eda93736 --- /dev/null +++ b/lite/operators/topk_pooling_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class TopkPoolingOp : public OpLite { + public: + TopkPoolingOp() {} + explicit TopkPoolingOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "topk_pooling"; } + + private: + mutable TopkPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/transpose_op.cc b/lite/operators/transpose_op.cc index fe40bf6fa2f84ce7c999b41435aed00cd6555887..8f1372a883a1cd54ac2368f1e7f5e30a60a6b1db 100644 --- a/lite/operators/transpose_op.cc +++ b/lite/operators/transpose_op.cc @@ -43,24 +43,9 @@ bool TransposeOp::CheckShape() const { } bool TransposeOp::InferShapeImpl() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); auto x_dims = param_.x->dims(); - auto x_rank = x_dims.size(); std::vector axis = param_.axis; size_t axis_size = axis.size(); - // "The input tensor's rank(%d) should be equal to the axis's size(%d)", - // x_rank, axis_size - CHECK_OR_FALSE(x_rank == axis_size); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - // Each element of Attribute axis should be a unique value - // range from 0 to (dims - 1), - // where the dims is the axis's size - CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && - ++count[axis[i]] == 1); - } lite::DDim out_dims(x_dims); for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; @@ -113,24 +98,9 @@ bool Transpose2Op::CheckShape() const { } bool Transpose2Op::InferShapeImpl() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); auto x_dims = param_.x->dims(); - auto x_rank = x_dims.size(); std::vector axis = param_.axis; size_t axis_size = axis.size(); - // "The input tensor's rank(%d) should be equal to the axis's size(%d)", - // x_rank, axis_size - CHECK_OR_FALSE(x_rank == axis_size); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - // Each element of Attribute axis should be a unique value - // range from 0 to (dims - 1), - // where the dims is the axis's size - CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && - ++count[axis[i]] == 1); - } lite::DDim out_dims(x_dims); for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc index 8cf11f6465d73646ec9bf846cbe6347bdc4b9f5b..612632acb4fbea692aa4a02dbd94bb1b506460bb 100644 --- a/lite/operators/var_conv_2d_op.cc +++ b/lite/operators/var_conv_2d_op.cc @@ -26,10 +26,16 @@ bool VarConv2dOp::InferShapeImpl() const { 
return true; } bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.X = const_cast( &scope->FindVar(opdesc.Input("X").front())->Get()); - // param_.ROW = const_cast( - // &scope->FindVar(opdesc.Input("ROW").front())->Get()); - // param_.COLUMN = const_cast( - // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + if (opdesc.HasInput("ROW") && !opdesc.Input("ROW").empty()) { + param_.ROW = const_cast( + &scope->FindVar(opdesc.Input("ROW").front())->Get()); + CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + } + if (opdesc.HasInput("COLUMN") && !opdesc.Input("COLUMN").empty()) { + param_.COLUMN = const_cast( + &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + } param_.W = const_cast( &scope->FindVar(opdesc.Input("W").front())->Get()); param_.Out = @@ -37,8 +43,6 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.Col = scope->FindVar(opdesc.Output("Col").front())->GetMutable(); CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; - // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; - // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; CHECK(param_.W) << "W(Input) of VarConv2dOP should not be null."; CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; @@ -52,6 +56,15 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasAttr("fuse_relu")) { param_.fuse_relu = opdesc.GetAttr("fuse_relu"); } +#ifdef LITE_WITH_XPU + if (opdesc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = opdesc.GetAttr("__xpu__float_to_fix"); + } + if (opdesc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = opdesc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/where_index_op.cc b/lite/operators/where_index_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..81443b7058e0c7d68008cbe98040b3f50eac852f --- /dev/null +++ b/lite/operators/where_index_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/where_index_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool WhereIndexdOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_GE(param_.input->dims().size(), 1);
+  return true;
+}
+
+bool WhereIndexdOp::InferShapeImpl() const {
+  int64_t rank = static_cast<int64_t>(param_.input->dims().size());
+  int64_t numel = static_cast<int64_t>(param_.input->dims().production());
+  param_.output->Resize({numel, rank});
+  return true;
+}
+
+bool WhereIndexdOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  AttachParam(&param_);
+  auto input = opdesc.Input("Condition").front();
+  auto output = opdesc.Output("Out").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+  param_.input = GetVar<lite::Tensor>(scope, input);
+  param_.output = GetMutableVar<lite::Tensor>(scope, output);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(where_index, paddle::lite::operators::WhereIndexdOp);
diff --git a/lite/operators/where_index_op.h b/lite/operators/where_index_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..157a3cb0be33ffad275ae55a0999095357a09948
--- /dev/null
+++ b/lite/operators/where_index_op.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class WhereIndexdOp : public OpLite {
+ public:
+  WhereIndexdOp() {}
+  explicit WhereIndexdOp(const std::string &op_type) : OpLite(op_type) {}
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "where_index_op"; }
+
+ private:
+  mutable WhereIndexParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/while_op.cc b/lite/operators/while_op.cc
index 1dcf9553f331ee6646ad6d93de048728a0886116..ab8e4a5489c13e042bf0d07da1228f33626a1d43 100644
--- a/lite/operators/while_op.cc
+++ b/lite/operators/while_op.cc
@@ -20,31 +20,23 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-bool WhileOpLite::CheckShape() const {
-  CHECK_OR_FALSE(param_.sub_block);
-  CHECK_OR_FALSE(param_.scope);
+bool WhileOp::CheckShape() const {
   CHECK_OR_FALSE(param_.cond);
+  CHECK_OR_FALSE(param_.program_desc);
+  CHECK_OR_FALSE(param_.exec_scope);
   return true;
 }
 
-bool WhileOpLite::InferShapeImpl() const { return true; }
-
-bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
-  auto inputs = op_desc.Input("X");
-  auto outs = op_desc.Output("Out");
-
-  for (auto var : inputs) {
-    // param_.x.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
-  }
-  for (auto var : outs) {
-    // param_.outs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
-  }
-  param_.sub_block = sub_block_;
+bool WhileOp::InferShapeImpl() const { return true; }
 
+bool WhileOp::AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) {
   auto condition = op_desc.Input("Condition");
   param_.cond = scope->FindVar(condition[0])->GetMutable<lite::Tensor>();
-  param_.scope = scope;
-
+  CHECK(param_.program_desc);
+  param_.block_idx = op_desc.GetAttr<int32_t>("sub_block");
+  CHECK_GE(param_.block_idx, 0);
+  param_.exec_scope = scope;
+  CHECK(param_.exec_scope);
   return true;
 }
 
@@ -52,4 +44,4 @@ bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_OP(while, paddle::lite::operators::WhileOpLite);
+REGISTER_LITE_OP(while, paddle::lite::operators::WhileOp);
diff --git a/lite/operators/while_op.h b/lite/operators/while_op.h
index 94aec15a6d3eb60036bf9c2168fdbd855b84a396..e448ee568723b24a241c5bb127ac61458385337e 100644
--- a/lite/operators/while_op.h
+++ b/lite/operators/while_op.h
@@ -13,6 +13,7 @@
 // limitations under the License.
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,24 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class WhileOpLite : public OpLite { +class WhileOp : public OpLite { public: - WhileOpLite() {} - explicit WhileOpLite(const std::string &op_type) : OpLite(op_type) {} + WhileOp() {} + explicit WhileOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "while"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable WhileParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt index 810a20abbc0d13897822cef2c99e5942e352a19f..e9c6574c19bcb6a238503d7b5fc955db9b96d689 100644 --- a/lite/tests/api/CMakeLists.txt +++ b/lite/tests/api/CMakeLists.txt @@ -1,3 +1,13 @@ +if(LITE_WITH_ARM) + lite_cc_test(test_transformer_with_mask_fp32_arm SRCS test_transformer_with_mask_fp32_arm.cc + DEPS ${lite_model_test_DEPS} paddle_api_full + ARM_DEPS ${arm_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/transformer_with_mask_fp32 SERIAL) + if(WITH_TESTING) + add_dependencies(test_transformer_with_mask_fp32_arm extern_lite_download_transformer_with_mask_fp32_tar_gz) + endif() +endif() + if(LITE_WITH_XPU) lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils @@ -6,11 +16,25 @@ if(LITE_WITH_XPU) lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/ernie) lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/bert) + if(WITH_TESTING) + add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz) + add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz) + add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz) + endif() + # TODO(miaotianxiang): enable later + #lite_cc_test(test_fpr_lite_xpu SRCS test_fpr_lite_xpu.cc + #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + #lite_cc_test(test_mmdnn_lite_xpu SRCS test_mmdnn_lite_xpu.cc + #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) endif() if(LITE_WITH_RKNPU) diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc index b3ee9febb3f0eabd36118680beca66ace9470de4..5d66fd0d5496e105ba97bea6c5e5387d96c9e01b 100644 --- 
a/lite/tests/api/test_bert_lite_xpu.cc +++ b/lite/tests/api/test_bert_lite_xpu.cc @@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) { for (size_t i = 0; i < results.size(); ++i) { for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR( - out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + out->data()[j + (out->shape()[1] * i)], results[i][j], 3e-5); } } } diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc index 0b614fec96cbcc5d9c96653681d0e8794cf4ab8f..b1db9f353657f3f09bcad25db4e777b05f15e0f7 100644 --- a/lite/tests/api/test_ernie_lite_xpu.cc +++ b/lite/tests/api/test_ernie_lite_xpu.cc @@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) { for (size_t i = 0; i < results.size(); ++i) { for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR( - out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + out->data()[j + (out->shape()[1] * i)], results[i][j], 2e-5); } } } diff --git a/lite/tests/api/test_fpr_lite_xpu.cc b/lite/tests/api/test_fpr_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..026c25690fe2a673be0a5a97b163d7bbe5fdb4f6 --- /dev/null +++ b/lite/tests/api/test_fpr_lite_xpu.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +TEST(ResnetCbam, test_resnet_cbam_lite_xpu) { + lite_api::CxxConfig config; + // config.set_model_dir(FLAGS_model_dir); + config.set_model_file(FLAGS_model_dir + "/__model__"); + config.set_param_file(FLAGS_model_dir + "/__params__"); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + std::vector input_shape{1, 3, 224, 224}; + input_tensor->Resize(input_shape); + auto* data = input_tensor->mutable_data(); + int input_num = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + for (int i = 0; i < input_num; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_mmdnn_lite_xpu.cc b/lite/tests/api/test_mmdnn_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..72d774db14d955f17caee217f13fddb32acb93c3 --- /dev/null +++ b/lite/tests/api/test_mmdnn_lite_xpu.cc @@ -0,0 +1,299 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +DEFINE_bool(perf, false, "perf?"); +DEFINE_string(perf_input, "perf_input", "perf_input"); +DEFINE_int32(perf_batch_size, 40, "perf_batch_size"); +DEFINE_bool(use_xpu, true, "use_xpu?"); +DEFINE_int32(perf_dev, 0, "perf_dev"); + +namespace paddle { +namespace lite { + +class SampleReader { + public: + std::vector> data; + std::vector> lod; + + void Read() { + std::string raw_input = + "0 1;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 142 " + "2114 197 10 2899;125 756226 756913 855693 760836 10 750793;125 584 " + "142 2114 197 10 2899 2 825 32 18499 125 584 295 2114 197 2114 2730 6 " + "15 32 18499 125 584 142 295 2114 1423 21 2 334 863 5122 197 974 21 " + "295 619 25 2114 1755 2701 197 15 216 23 18499 125 584 142 599 3228 23 " + "2 5122 1917 804 5 2114 197 1236 3 2114 1403 15 3886 1080 23 1150 125 " + "475 23 2998 23;125 756226 756913 855693 760836 10 750793 2 825 750355 " + "18499 881680 756226 295 765124 760836 2114 872813 754265 15 32 18499 " + "881680 756226 756913 761251 765124 752843 766823 2 334 759834 5122 " + "774643 758458 21 295 755114 25 1148365 1755 2701 197 15 216 23 18499 " + "881680 756226 756913 826848 3228 23 2 5122 831009 804 752371 2114 " + "760836 1236 3 2114 910393 15 3886 1080 23 877375 752137 761034 792123 " + "2998 23;1;1;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;121 28 1054 " + "1459 125 72 32 2321 531 125 295 584 142 2114 197 14 477 30 121;121 28 " + "764114 1459 753052 750694 750001 886192 750435 752179 295 584 756913 " + "855693 760836 14 477 30 753504;121 28 1054 1459 125 72 32 2321 531 " + "125 295 584 142 2114 197 2 121 28 1054 1459 125 72 32 2321 531 125 " + "295 584 142 4 263 2114 197 43 95 863 2114 323 20 142 626 11 2 45 10 " + "45 58 142 65 918 741 2114 197 764 3 5122 26 51 1266 2037 295 222 1121 " + "4491 3 545 4338 11 2 5122 26 495 3 142 3444 3249 2114 197 3 626 4 " + "2794;121 28 764114 1459 753052 750694 750001 886192 750435 752179 295 " + "584 756913 855693 760836 2 121 28 764114 1459 753052 750694 750001 " + "886192 750435 752179 295 584 756913 4 750885 2114 760836 43 750030 " + "754302 2114 323 822131 142 626 769001 2 45 750128 750324 58 142 " + "1147454 918 910829 2114 760836 841946 767340 5122 779102 51 1266 2037 " + "756461 222 752031 942669 1139389 780275 4338 830597 2 5122 779102 495 " + "761418 142 3444 852932 2114 760836 3 760162 757966 751127;121 295 " + "5593 142 2114 197;121 295 5593 925208 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 125 584 " + "142 2114 1423 14 5283 1745 73;207 752276 756226 756913 855693 752843 " + "14 5283 781651 786597;6109 18807 142 5 64 5283 1745 73 3690 1060 3626 " + "4 716 51 1030 2114 197 4 428 936 9066 10 10 10 2 207 125 584 142 2114 " + "1423 2 15329 2114 197 5669 401 318 285 953 4 2114 197 2285 7 1783 11 " + "2 5122 197 14017 584;6109 18807 142 5 755319 5283 781651 786597 3690 " + "1060 3626 4 716 910478 1030 2114 760836 4 750323 936 9066 10 750002 " + "750002 2 207 752276 756226 756913 855693 752843 2 15329 2114 760836 " + "5669 401 318 757541 750261 4 2114 760836 2285 7 757639 11 2 5122 " + "774643 14017 584;125 584 142 1745 5122;125 756226 756913 1745 " + "755836;\n" + "0 0;125 584 142 2114 197;125 
756226 756913 855693 760836;149 396 778 " + "584 142 295 2114 1423 14 64 125 584 73 21 36670 5834 10 211 25;149 " + "751876 1048872 584 756913 761251 765124 752843 14 64 125 756226 73 " + "944567 36670 5834 10 750012 753240;101 10 2114 197 3 946 2 149 396 " + "778 584 142 295 2114 1423 2 2610 6 1444 111 2114 948 72 32 21 15 494 " + "25 4 2114 197 5669 1145 2 148 295 149 396 778 584 142 295 21 22853 41 " + "348 619 25 366 5305 2114 807 4 1115 381 1955 2114 11;101 751178 2114 " + "760836 3 946 2 149 751876 1048872 584 756913 761251 765124 752843 2 " + "2610 753567 775165 750899 972788 948 750125 750001 751875 15 494 25 4 " + "2114 760836 5669 1145 2 148 808886 982157 751876 1048872 584 756913 " + "761251 790772 22853 41 348 619 25 366 894206 2114 1008440 4 753953 " + "381 851474 765868 11;149 396 778 584 142 295 2 149 396 354 778 584 " + "142 1333 2 584 778 295 5122 2 149 396 778 584 3609 2 149 396 64478 " + "816 14246 1423 2 149 396 584 32 127 19 3609 2 149 396 584 73 2 149 " + "396 584 778 295 2285 142 4922 323 2 149 396 584 2114 2 149 396 253 " + "584 2114 197;149 751876 1048872 584 756913 761251 2 149 751876 756286 " + "767182 584 756913 1333 2 584 778 897778 941364 2 149 751876 1048872 " + "584 1102835 2 149 751876 64478 816 14246 912094 2 149 751876 584 " + "773547 127 750771 791456 2 149 751876 584 73 2 149 751876 584 778 " + "897778 2285 751493 791984 323 2 149 751876 584 2114 2 149 751876 " + "808443 835481 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 545 " + "149 14 125 584;125 756226 545 874302 14 125 756226;2204 25 30 1692 " + "1770 6534 295 125 584 72 32 1346 4 2698 2114 197 11 2 4235 4301 240 " + "295 125 584 72 32 21 6708 15 56974 494 25 1030 2114 197 110 804 495 " + "611 2 221 759 341 6 5283 1745 73 71 2114 1423 71 125 584 545 149 149 " + "2 505 345 58 125 584 65 3486 2114 295 4 45 786 196 6604 6086;2204 25 " + "30 797189 1770 1191824 295 752782 756226 751697 750001 1346 4 2698 " + "2114 760836 765158 2 4235 4301 240 753859 752782 756226 751697 750001 " + "751875 6708 15 56974 494 25 1030 2114 760836 777607 762850 966521 611 " + "2 221 752565 750130 750084 910219 781651 786597 71 2114 752843 71 125 " + "756226 545 874302 149 2 505 825657 782848 125 756226 65 3486 2114 " + "760669 4 45 755747 758903 6604 6086;125 584 2114 2 125 584 2114 1423 " + "2 125 584 2114 149 2 149 584 1745 5122 725 2 2114 125 584 2 125 584 " + "2114 2 2621 584 2114 2 527 37 2754 130 170 1013 494 887 240 2 4521 " + "11111 586 2321 531 125 584 142 1360 816 2842 1423 2 125 584 2114;125 " + "756226 2114 2 125 756226 2114 752843 2 125 756226 2114 783644 2 149 " + "760183 1745 755836 725 2 2114 125 756226 2 125 756226 2114 2 2621 " + "932600 2114 2 527 751304 869964 754462 170 1013 750719 778287 774620 " + "2 4521 11111 586 2321 750435 752179 756226 756913 1360 764399 2842 " + "1423 2 125 756226 2114;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 584 142 " + "2114 197 4 207 584 142 2114 197 674 14 240 4328 14 4328 767;207 " + "1237071 756913 855693 760836 4 207 1237071 756913 855693 760836 674 " + "14 240 755573 14 4328 795065;207 584 142 2114 197 2 325 71 71 207 584 " + "142 2114 197 2 876 125 140 2114 197 2 207 584 142 2114 197 674 1210 " + "239 4328 767 268 1349 485 28 4389 504 3 941 57 1419 1978 11;207 " + "1237071 756913 855693 760836 2 325 71 71 207 1237071 756913 855693 " + "760836 2 876 125 750977 1250790 760836 2 207 1237071 756913 855693 " + "760836 674 814792 755820 812174 795065 818859 817155 816597 761001 " + "774461 780904 820475 
1109800 790141 790459 780324 770390;584 142 295 " + "2114 232 2 207 584 2114 197 2 584 142 295 2114 232 2 584 142 512 2114 " + "197;584 756913 761251 765124 1006359 2 207 1237071 2114 760836 2 584 " + "756913 761251 765124 1006359 2 584 756913 879930 2114 760836;"; + + auto lines = Split(raw_input, "\n"); + for (auto& line : lines) { + auto split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } + } + + for (size_t i = 1; i < split1.size(); ++i) { + auto split2 = Split(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (auto e : split2) { + data[i - 1].push_back(std::stoi(e.c_str(), nullptr, 0)); + } + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + } + } +}; + +class FileReader { + std::ifstream ifs; + + public: + std::vector> data; + std::vector> lod; + + void Init(std::string file_name) { ifs.open(file_name); } + + int Read(int maxline) { + data.clear(); + lod.clear(); + + std::string line; + int cnt = 0; + while (cnt < maxline && getline(ifs, line)) { + std::vector split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } + } + + for (size_t i = 1; i < split1.size(); i++) { + std::vector split2 = Split(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (size_t j = 0; j < split2.size(); j++) { + data[i - 1].push_back(std::stoi(split2[j].c_str(), nullptr, 0)); + } + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + cnt++; + } + return cnt; + } +}; + +TEST(MMDNN, test_mmdnn_lite_xpu) { + lite_api::CxxConfig config; + // config.set_model_dir(FLAGS_model_dir); + config.set_model_file(FLAGS_model_dir + "/__model__"); + config.set_param_file(FLAGS_model_dir + "/__param__"); + config.set_xpu_dev_per_thread(FLAGS_perf_dev); + if (FLAGS_use_xpu) { + config.set_valid_places( + {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } else { + config.set_valid_places( + {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + if (FLAGS_perf) { + FileReader file_reader; + file_reader.Init(FLAGS_perf_input); + int UB_batch = FLAGS_perf_batch_size; // upper bound of batch + int iter = 0; + double tsc_sum = 0; + + while (true) { + int batch = file_reader.Read(UB_batch); + if (batch <= 0) { + break; + } + ++iter; + for (size_t i = 0; i < file_reader.data.size(); ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)file_reader.data[i].size(), 1}); + input_x->SetLoD({file_reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + file_reader.data[i].data(), + file_reader.data[i].size() * sizeof(int64_t)); + } + + auto start = GetCurrentUS(); + predictor->Run(); + auto end = GetCurrentUS(); + tsc_sum += end - start; + } + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " + << FLAGS_threads << ", warmup: " << FLAGS_warmup + << ", repeats: " << iter << ", spend " << 
tsc_sum / iter / 1000.0 + << " ms in average."; + + return; + } + + SampleReader sample_reader; + sample_reader.Read(); + + for (size_t i = 0; i < sample_reader.data.size(); ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)sample_reader.data[i].size(), 1}); + input_x->SetLoD({sample_reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + sample_reader.data[i].data(), + sample_reader.data[i].size() * sizeof(int64_t)); + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + auto out = predictor->GetOutput(0); + auto out_shape = out->shape(); + auto out_size = std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies()); + for (int i = 0; i < out_size; ++i) { + LOG(INFO) << "out[" << i << "] = " << out->data()[i]; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_transformer_with_mask_fp32_arm.cc b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e65b017aa1440683d86d0da03686a2be9c4c6ee5 --- /dev/null +++ b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc @@ -0,0 +1,274 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +template +void SetTensorData(const std::vector &data, + const std::vector &shape, + paddle::lite_api::Tensor *tensor, + const std::vector> &lod = {}) { + tensor->Resize(shape); + tensor->SetLoD(lod); + std::copy(data.begin(), data.end(), tensor->mutable_data()); +} + +void PrepareInputData( + const std::shared_ptr &predictor, + std::vector src_word_data, + int max_seq_len = 16, // padding + int max_out_len = 8, + int bos_idx = 0, + int eos_idx = 1, + int n_head = 8) { + // src_word + auto src_word = predictor->GetInput(0); + int seq_len = src_word_data.size(); + for (int i = seq_len; i < max_seq_len; i++) { + src_word_data.push_back(eos_idx); + } + std::vector src_word_shape{ + 1, static_cast(src_word_data.size())}; + SetTensorData(src_word_data, src_word_shape, src_word.get()); + // src_pos + auto src_pos = predictor->GetInput(1); + std::vector src_pos_data(src_word_data.size()); + std::iota(src_pos_data.begin(), src_pos_data.end(), 0); + std::vector src_pos_shape{1, + static_cast(src_pos_data.size())}; + SetTensorData(src_pos_data, src_pos_shape, src_pos.get()); + // src_slf_attn_bias + auto src_slf_attn_bias = predictor->GetInput(2); + std::vector src_slf_attn_bias_data(1 * n_head * src_word_data.size() * + src_word_data.size()); + int offset = 0; + for (int j = 0; j < 1 * n_head * src_word_data.size(); j++) { + for (int i = 0; i < seq_len; i++) { + src_slf_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + src_slf_attn_bias_data[offset++] = -1e9f; + } + } + std::vector src_slf_attn_bias_shape{ + 1, + n_head, + static_cast(src_word_data.size()), + static_cast(src_word_data.size())}; + SetTensorData( + src_slf_attn_bias_data, src_slf_attn_bias_shape, src_slf_attn_bias.get()); + // trg_word + auto trg_word = predictor->GetInput(3); + std::vector trg_word_data(2, 0); + std::vector trg_word_shape{2, 1}; + std::vector lod_level_0{0, 2}; + std::vector lod_level_1{0, 1, 2}; + std::vector> trg_word_lod(2); + trg_word_lod[0] = lod_level_0; + trg_word_lod[1] = lod_level_1; + SetTensorData( + trg_word_data, trg_word_shape, trg_word.get(), trg_word_lod); + // init_score + auto init_score = predictor->GetInput(4); + std::vector init_score_data(2); + init_score_data[0] = 0; + init_score_data[1] = -1e9f; + std::vector init_score_shape{2, 1}; + std::vector> init_score_lod(trg_word_lod); + SetTensorData( + init_score_data, init_score_shape, init_score.get(), init_score_lod); + // init_idx + auto init_idx = predictor->GetInput(5); + std::vector init_idx_data(2, 0); + std::vector init_idx_shape{2}; + SetTensorData(init_idx_data, init_idx_shape, init_idx.get()); + // trg_slf_attn_bias + auto trg_slf_attn_bias = predictor->GetInput(6); + std::vector trg_slf_attn_bias_data(max_out_len * n_head * 1 * + max_out_len); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + trg_slf_attn_bias_data[offset++] = (i <= k) ? 
0.0f : -1e9f; + } + } + } + std::vector trg_slf_attn_bias_shape{ + max_out_len, n_head, 1, max_out_len}; + SetTensorData( + trg_slf_attn_bias_data, trg_slf_attn_bias_shape, trg_slf_attn_bias.get()); + // trg_src_attn_bias + auto trg_src_attn_bias = predictor->GetInput(7); + std::vector trg_src_attn_bias_data(1 * n_head * 1 * + src_word_data.size()); + offset = 0; + for (int j = 0; j < 1 * n_head * 1; j++) { + for (int i = 0; i < seq_len; i++) { + trg_src_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + trg_src_attn_bias_data[offset++] = -1e9f; + } + } + std::vector trg_src_attn_bias_shape{ + 1, n_head, 1, static_cast(src_word_data.size())}; + SetTensorData( + trg_src_attn_bias_data, trg_src_attn_bias_shape, trg_src_attn_bias.get()); + // kv_padding_selection + auto kv_padding_selection = predictor->GetInput(8); + std::vector kv_padding_selection_data(max_out_len * n_head * + max_out_len * 1); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + kv_padding_selection_data[offset++] = (i == k) ? 1.0f : 0.0f; + } + } + } + std::vector kv_padding_selection_shape{ + max_out_len, n_head, max_out_len, 1}; + SetTensorData(kv_padding_selection_data, + kv_padding_selection_shape, + kv_padding_selection.get()); +} + +void CheckOutputData( + const std::shared_ptr &predictor, + const std::vector &ref_seq_ids_data, + const std::vector &ref_seq_scores_data) { + // seq_ids + auto seq_ids = predictor->GetOutput(0); + auto seq_ids_shape = seq_ids->shape(); + auto seq_ids_size = std::accumulate(seq_ids_shape.begin(), + seq_ids_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_ids_size, ref_seq_ids_data.size()); + auto *seq_ids_data = seq_ids->data(); + for (size_t i = 0; i < seq_ids_size; i++) { + EXPECT_EQ(seq_ids_data[i], ref_seq_ids_data[i]); + } + // seq_scores + auto seq_scores = predictor->GetOutput(1); + auto seq_scores_shape = seq_scores->shape(); + auto seq_scores_size = std::accumulate(seq_scores_shape.begin(), + seq_scores_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_scores_size, ref_seq_scores_data.size()); + auto *seq_scores_data = seq_scores->data(); + for (size_t i = 0; i < seq_scores_size; i++) { + EXPECT_NEAR(seq_scores_data[i], ref_seq_scores_data[i], 1e-5); + } +} + +TEST(TransformerWithMask, test_transformer_with_mask_fp32) { + // Save the optimized model by using full api with CxxConfig + lite_api::CxxConfig cxx_config; + cxx_config.set_model_dir(FLAGS_model_dir); + cxx_config.set_valid_places( + {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kInt64)}}); + auto predictor = lite_api::CreatePaddlePredictor(cxx_config); + predictor->SaveOptimizedModel(FLAGS_model_dir + ".nb", + paddle::lite_api::LiteModelType::kNaiveBuffer); + // Load the optimized model and run inference by using light api with + // MobileConfig + paddle::lite_api::MobileConfig mobile_config; + mobile_config.set_model_from_file(FLAGS_model_dir + ".nb"); + mobile_config.set_threads(1); + mobile_config.set_power_mode(paddle::lite_api::PowerMode::LITE_POWER_HIGH); + std::vector, + std::pair, std::vector>>> + test_cases = { + {{16, 16, 16, 1}, + {{0, 16, 16, 16, 16, 16, 16, 1, 0, 16, 16, 16, 16, 16, 9, 1}, + {0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -7.45856f, + -7.58065f, + 0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -8.70994f, + -8.8053f}}}, + {{16, 16, 16, 10, 1}, + {{0, 
6, 53, 11, 1, 0, 6, 53, 56, 4, 1}, + {0.0f, + -2.36122f, + -4.1678f, + -6.19764f, + -7.69256f, + 0.0f, + -2.36122f, + -4.1678f, + -6.20145f, + -7.66355f, + -8.63024f}}}, + {{126, 4, 33, 1}, + {{0, 68, 5, 17, 1, 0, 68, 5, 13, 14, 1}, + {0.0f, + -0.829941f, + -1.20217f, + -2.23938f, + -2.98262f, + 0.0f, + -0.829941f, + -1.20217f, + -2.25051f, + -3.07555f, + -3.57711f}}}, + {{126, 4, 33, 99, 1}, + {{0, 14, 242, 17, 1, 0, 93, 38, 27, 68, 1}, + {0.f, + -1.8504f, + -2.66679f, + -3.09469f, + -3.63227f, + 0.0f, + -1.33829f, + -1.41656f, + -3.1333f, + -3.27901f, + -3.88582f}}}}; + for (auto &test_case : test_cases) { + PrepareInputData(predictor, test_case.first); + predictor->Run(); + CheckOutputData(predictor, test_case.second.first, test_case.second.second); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 03f0de291e80d821af5704727dbd30b10d2ca453..b8d142d7f5cc322b5950ebd512f6e60cd40f247a 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,89 +1,91 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework 
${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_group_norm_compute SRCS group_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} 
${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) - lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() endif() - lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_interp_compute SRCS interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_is_empty_compute SRCS is_empty_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_crf_decoding_compute SRCS crf_decoding_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_interp_compute SRCS interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_is_empty_compute SRCS is_empty_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS 
arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_crf_decoding_compute SRCS crf_decoding_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index c71eac8d4532eefd5569421807c85128746c6c8b..0e803f1281fe2fc4dfca70c3f5223b8835ad7eff 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -38,7 +38,8 @@ enum activation_type_test { GELU, SQUARE, HARD_SWISH, - RECIPROCAL + RECIPROCAL, + THRESHOLDED_RELU }; class ActivationComputeTester : public arena::TestCase { @@ -54,6 +55,7 @@ class ActivationComputeTester : public arena::TestCase { float hard_swish_threshold = 6.0; float hard_swish_scale = 6.0; float hard_swish_offset = 3.0; + float relu_threshold_ = 1.0; DDim dims_{{1}}; std::string type_ = ""; activation_type_test act_type_ = RELU; @@ -218,6 +220,12 @@ class ActivationComputeTester : public arena::TestCase { } break; } + case THRESHOLDED_RELU: { + for (int i = 0; i < dims_.production(); i++) { + output_data[i] = x_data[i] > relu_threshold_ ? x_data[i] : 0.f; + } + break; + } default: LOG(INFO) << "the type of activation is unknow."; } @@ -245,6 +253,9 @@ class ActivationComputeTester : public arena::TestCase { op_desc->SetAttr("scale", hard_swish_scale); op_desc->SetAttr("offset", hard_swish_offset); } + if (act_type_ == THRESHOLDED_RELU) { + op_desc->SetAttr("threshold", relu_threshold_); + } } void PrepareData() override { @@ -289,8 +300,11 @@ TEST(Activation_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -313,6 +327,9 @@ TEST(Activation_leaky_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -393,6 +410,9 @@ TEST(Activation_sigmoid, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -415,8 +435,11 @@ TEST(Activation_tanh, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -456,6 +479,9 @@ TEST(Activation_relu6, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = 
TARGET(kARM);
+#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
+  place = TARGET(kHuaweiAscendNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
 #else
   return;
 #endif
@@ -561,7 +587,7 @@ TEST(Activation_gelu, precision) {
   LOG(INFO) << "test gelu op";
   Place place;
   float abs_error = 2e-5;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
@@ -632,5 +658,35 @@ TEST(activation_reciprocal, precision) {
   }
 }
 
+TEST(Activation_thresholded_relu, precision) {
+  LOG(INFO) << "test thresholded_relu op";
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+
+  for (auto dims : std::vector<std::vector<int64_t>>{
+           {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
+    std::unique_ptr<arena::TestCase> tester(
+        new ActivationComputeTester(place,
+                                    "def",
+                                    0.01,
+                                    6.,
+                                    "all",
+                                    0.,
+                                    DDim(dims),
+                                    "thresholded_relu",
+                                    THRESHOLDED_RELU));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/tests/kernels/activation_grad_compute_test.cc b/lite/tests/kernels/activation_grad_compute_test.cc
index 5d5046b01dee6c84f341159b68300197c20695e6..2ad5b80a910f323b34b039eabda0ceb4b49784c5 100644
--- a/lite/tests/kernels/activation_grad_compute_test.cc
+++ b/lite/tests/kernels/activation_grad_compute_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/activation_grad_compute.h"
+#include "lite/kernels/host/activation_grad_compute.h"
 #include <gtest/gtest.h>
 #include "lite/core/op_registry.h"
 #include "lite/kernels/arm/activation_compute.h"
@@ -20,13 +20,11 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
 
 using param_t = operators::ActivationParam;
 using grad_param_t = operators::ActivationGradParam;
-using kernel_t = SquareCompute;
-using grad_kernel_t = SquareGradCompute;
 
+template <class kernel_t, class grad_kernel_t>
 class ActivationGradTester {
  public:
   explicit ActivationGradTester(DDim dims) : dims_(dims) {}
@@ -71,22 +69,28 @@ class ActivationGradTester {
   void run_backward(grad_param_t* param,
                     grad_kernel_t* kernel,
                     const std::vector<float>& in_vec,
+                    const std::vector<float>& out_vec,
                     const std::vector<float>& out_grad_vec,
                     float* in_grad_vec) {
     Tensor x;
+    Tensor out;
     Tensor x_grad;
     Tensor out_grad;
     x.Resize(dims_);
+    out.Resize(dims_);
     x_grad.Resize(dims_);
     out_grad.Resize(dims_);
     auto* x_data = x.mutable_data<float>();
+    auto* out_data = out.mutable_data<float>();
     auto* out_grad_data = out_grad.mutable_data<float>();
     for (int i = 0; i < dims_.production(); i++) {
       x_data[i] = in_vec[i];
+      out_data[i] = out_vec[i];
       out_grad_data[i] = out_grad_vec[i];
     }
 
     param->X = &x;
+    param->Out = &out;
     param->X_grad = &x_grad;
     param->Out_grad = &out_grad;
     kernel->SetParam(*param);
@@ -102,7 +106,9 @@ class ActivationGradTester {
     std::vector<float> x(dims_.production());
     std::vector<float> out(dims_.production());
     for (int i = 0; i < dims_.production(); i++) {
-      x[i] = 1.0 * static_cast<float>(i % 128) * 0.3f - 1.1;
+      x[i] = static_cast<float>(i % 3 - 2.0) / 2.0 * 0.333 +
+             static_cast<float>(i % 19 - 10.0) / 10.0 * 0.333 +
+             static_cast<float>(i % 39 - 20.0) / 20.0 * 0.333 + 0.001213;
     }
 
     this->run_forward(&param_, &kernel_, x, out.data());
@@ -120,7 +126,8 @@ class ActivationGradTester {
     for (int i = 0; i < dims_.production(); i++) {
       out_grad[i] = 1.0;
     }
-    this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data());
+    this->run_backward(
+        &grad_param_,
&grad_kernel_, x, out, out_grad, x_grad.data()); for (int i = 0; i < dims_.production(); i++) { EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta); @@ -137,31 +144,58 @@ class ActivationGradTester { grad_param_t grad_param_; }; -void TestNormalCase(DDim dims) { - std::unique_ptr tester(new ActivationGradTester(dims)); +void TestSquareGrad(DDim dims) { + LOG(INFO) << "Test Square grad"; + std::unique_ptr< + ActivationGradTester> + tester( + new ActivationGradTester( + dims)); tester->prepare_kernel(); float delta = 0.001; float max_grad_delta = 0.005; tester->check_grad(delta, max_grad_delta); } -TEST(activation_grad_arm, compute) { - LOG(INFO) << "Test Square grad"; +void TestReluGrad(DDim dims) { + LOG(INFO) << "Test Relu grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +void TestTanhGrad(DDim dims) { + LOG(INFO) << "Test Tanh grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +TEST(activation_grad_host, compute) { DeviceInfo::Init(); - for (auto n : {2}) { - for (auto c : {2}) { - for (auto h : {2}) { - for (auto w : {2}) { - TestNormalCase(DDim(std::vector({n, c, h, w}))); + for (auto n : {2, 1}) { + for (auto c : {2, 9}) { + for (auto h : {2, 1}) { + for (auto w : {2, 10}) { + TestSquareGrad(DDim(std::vector({n, c, h, w}))); + TestReluGrad(DDim(std::vector({n, c, h, w}))); + TestTanhGrad(DDim(std::vector({n, c, h, w}))); } } } } } -} // namespace arm } // namespace kernels } // namespace lite } // namespace paddle USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(square_grad, kHost, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc index ae65e0e3c320ff153a99d2a1656227bad34428d4..9674f95d0b52dbc264ef78748d0c0fba1e4ebc37 100644 --- a/lite/tests/kernels/batch_norm_compute_test.cc +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -157,7 +157,7 @@ TEST(BatchNorm, precision) { LOG(INFO) << "test BatchNorm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/box_clip_compute_test.cc b/lite/tests/kernels/box_clip_compute_test.cc index 72947fa4b258a894e5a73c5e8fe8cce12ef9a02c..c599e64214d3fb15a52cb14fe48de7a7d75b2868 100644 --- a/lite/tests/kernels/box_clip_compute_test.cc +++ b/lite/tests/kernels/box_clip_compute_test.cc @@ -70,9 +70,7 @@ class BoxClipComputeTester : public arena::TestCase { float sign = i % 3 == 0 ? 
-1.0f : 1.0f;
       input_data[i] = sign * static_cast<float>((i * 7) % 20);
     }
-    SetCommonTensor(input_, input_dims_, input_data.data());
-    auto input_tensor = baseline_scope()->FindMutableTensor(input_);
-    input_tensor->set_lod(input_lod_);
+    SetCommonTensor(input_, input_dims_, input_data.data(), input_lod_);
 
     std::vector<float> im_info_data{10, 10, 1, 15, 15, 1};
     SetCommonTensor(im_info_, im_info_dim_, im_info_data.data());
diff --git a/lite/tests/kernels/cast_compute_test.cc b/lite/tests/kernels/cast_compute_test.cc
index 86331bb8a1cce89da76d2ebb87a9d091e34f68c5..34038dfdc797d0e5ee618b575ad532fd64809276 100644
--- a/lite/tests/kernels/cast_compute_test.cc
+++ b/lite/tests/kernels/cast_compute_test.cc
@@ -135,7 +135,7 @@ TEST(Cast, precision) {
   float abs_error = 2e-5;
 #if defined(LITE_WITH_ARM)
   place = TARGET(kARM);
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
diff --git a/lite/tests/kernels/clip_compute_test.cc b/lite/tests/kernels/clip_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c6149bb753b2a83813d0a129d61d7444456c399
--- /dev/null
+++ b/lite/tests/kernels/clip_compute_test.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+
+namespace paddle {
+namespace lite {
+
+class ClipComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string x_ = "x";
+  std::string out_ = "out";
+  std::string min_tensor_ = "min_tensor";
+  std::string max_tensor_ = "max_tensor";
+  float min_{};
+  float max_{};
+  bool use_minmax_tensor_{};
+  DDim x_dims_;
+
+ public:
+  ClipComputeTester(const Place& place,
+                    const std::string& alias,
+                    int n,
+                    int c,
+                    int h,
+                    int w,
+                    float min,
+                    float max,
+                    bool use_minmax_tensor)
+      : TestCase(place, alias) {
+    x_dims_ = DDim(std::vector<int64_t>({n, c, h, w}));
+    min_ = min;
+    max_ = max;
+    use_minmax_tensor_ = use_minmax_tensor;
+  }
+
+  void RunBaseline(Scope* scope) override {
+    auto* x = scope->FindTensor(x_);
+    auto* out = scope->NewTensor(out_);
+    CHECK(out);
+    out->Resize(x->dims());
+    const auto* x_data = x->data<float>();
+    auto* out_data = out->mutable_data<float>();
+
+    for (int i = 0; i < x->numel(); i++) {
+      if (x_data[i] < min_)
+        out_data[i] = min_;
+      else if (x_data[i] > max_)
+        out_data[i] = max_;
+      else
+        out_data[i] = x_data[i];
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("clip");
+    op_desc->SetInput("X", {x_});
+    op_desc->SetOutput("Out", {out_});
+    if (use_minmax_tensor_) {
+      op_desc->SetInput("Min", {min_tensor_});
+      op_desc->SetInput("Max", {max_tensor_});
+      op_desc->SetAttr("min", 0.f);
+      op_desc->SetAttr("max", 0.f);
+    } else {
+      op_desc->SetAttr("min", min_);
+      op_desc->SetAttr("max", max_);
+    }
+  }
+
+  void PrepareData() override {
+    std::vector<float> x_data(x_dims_.production());
+    for (int i = 0; i < x_dims_.production(); i++) {
+      float sign = i % 3 == 0 ? -1.0f : 1.0f;
+      x_data[i] = sign * static_cast<float>(i % 128) * 0.013f + 0.001;
+    }
+    SetCommonTensor(x_, x_dims_, x_data.data());
+
+    if (use_minmax_tensor_) {
+      std::vector<float> min_data = {min_};
+      SetCommonTensor(
+          min_tensor_, DDim(std::vector<int64_t>({1})), min_data.data());
+
+      std::vector<float> max_data = {max_};
+      SetCommonTensor(
+          max_tensor_, DDim(std::vector<int64_t>({1})), max_data.data());
+    }
+  }
+};
+
+TEST(Clip, precision) {
+  LOG(INFO) << "test clip op";
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+
+  float min = -1;
+  float max = 1;
+  for (int n : {1, 3}) {
+    for (int c : {3, 5}) {
+      for (int h : {5, 6}) {
+        for (int w : {6, 7}) {
+          for (bool use_minmax_tensor : {true, false}) {
+            std::unique_ptr<arena::TestCase> tester(new ClipComputeTester(
+                place, "def", n, c, h, w, min, max, use_minmax_tensor));
+            arena::Arena arena(std::move(tester), place, 2e-5);
+            arena.TestPrecision();
+          }
+        }
+      }
+    }
+  }
+#endif
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/kernels/conv_compute_test.cc b/lite/tests/kernels/conv_compute_test.cc
index 4442fe47e3a6410aa921d163ef0257602cce2fbc..a4bcf6ea70e3fe719793aa4ebd8fb8cd09e35905 100644
--- a/lite/tests/kernels/conv_compute_test.cc
+++ b/lite/tests/kernels/conv_compute_test.cc
@@ -413,6 +413,9 @@ TEST(Conv2d, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 5e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
+  place = TARGET(kHuaweiAscendNPU);
+  abs_error = 5e-2;  // Using fp16 in NPU
 #else
   return;
 #endif
diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc
index 025f02ce31505cee684fb9a21c7b26d96e1c3026..c4ecc0cf01e3da7c43294ba1249b5b4f106caa95 100644
--- a/lite/tests/kernels/dropout_compute_test.cc
+++ b/lite/tests/kernels/dropout_compute_test.cc
@@ -94,7 +94,7 @@ TEST(Dropout, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // Using fp16 in NPU
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
return; diff --git a/lite/tests/kernels/elementwise_compute_test.cc b/lite/tests/kernels/elementwise_compute_test.cc index 505ab72dc125d5b527845f4695a444c215422f8b..d91c304ef7e76b9ff623ebfe1bb9ad5bb4ace2c9 100644 --- a/lite/tests/kernels/elementwise_compute_test.cc +++ b/lite/tests/kernels/elementwise_compute_test.cc @@ -228,7 +228,7 @@ TEST(Elementwise, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc index 2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488..04e74e49099f13a7e5920b306f8d2e26650a2574 100644 --- a/lite/tests/kernels/elementwise_grad_compute_test.cc +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -215,18 +215,6 @@ class ElementwiseAddGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -242,14 +230,6 @@ class ElementwiseAddGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); @@ -443,18 +423,6 @@ class ElementwiseSubGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -470,14 +438,6 @@ class ElementwiseSubGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc index 4d0ad1ab47a17c3e8d227b9e0482d7cbe21ab7e2..c023a12b0fb4e3118976d854114c554ca6bf6462 100644 --- a/lite/tests/kernels/gather_compute_test.cc +++ b/lite/tests/kernels/gather_compute_test.cc @@ -98,7 +98,7 @@ TEST(Gather, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/group_norm_compute_test.cc b/lite/tests/kernels/group_norm_compute_test.cc new file mode 100644 index 
0000000000000000000000000000000000000000..a1df003850731eb4d355d01f65100d2b9d200224
--- /dev/null
+++ b/lite/tests/kernels/group_norm_compute_test.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
+
+namespace paddle {
+namespace lite {
+
+class GroupNormComputeTest : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string x_ = "x";
+  std::string y_ = "y";
+  std::string saved_mean_ = "saved_mean";
+  std::string saved_variance_ = "saved_variance";
+  std::string scale_ = "scale";
+  std::string bias_ = "bias";
+
+  DDim dims_{{4, 5, 19, 19}};
+  float epsilon_ = 1e-5f;
+  int groups_ = 1;
+  int channels_ = dims_[1];
+
+ public:
+  GroupNormComputeTest(const Place& place,
+                       const std::string& alias,
+                       DDim dims,
+                       float epsilon,
+                       int groups,
+                       int channels)
+      : TestCase(place, alias),
+        dims_(dims),
+        epsilon_(epsilon),
+        groups_(groups),
+        channels_(channels) {}
+
+  void RunBaseline(Scope* scope) override {
+    auto x = scope->FindTensor(x_);
+    auto scale = scope->FindTensor(scale_);
+    auto bias = scope->FindTensor(bias_);
+    auto y = scope->NewTensor(y_);
+    auto saved_mean = scope->NewTensor(saved_mean_);
+    auto saved_variance = scope->NewTensor(saved_variance_);
+    CHECK(y);
+    CHECK(saved_mean);
+    CHECK(saved_variance);
+    DDim saved_dim({dims_[0] * groups_});
+    y->Resize(dims_);
+    saved_mean->Resize(saved_dim);
+    saved_variance->Resize(saved_dim);
+
+    auto x_data = x->data<float>();
+    auto scale_data = scale->data<float>();
+    auto bias_data = bias->data<float>();
+    auto y_data = y->mutable_data<float>();
+    auto saved_mean_data = saved_mean->mutable_data<float>();
+    auto saved_variance_data = saved_variance->mutable_data<float>();
+
+    int n = x->dims()[0];
+    int ch_per_group = channels_ / groups_;
+    CHECK_EQ(x->dims()[1], channels_);
+    int spatial_size = ch_per_group * x->dims()[2] * x->dims()[3];
+    // compute mean
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float sum = 0.f;
+      for (int j = 0; j < spatial_size; ++j) {
+        sum += x_ptr[j];
+      }
+      saved_mean_data[i] = sum / spatial_size;
+    }
+    // compute variance
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float sum = 0.f;
+      for (int j = 0; j < spatial_size; ++j) {
+        sum +=
+            (x_ptr[j] - saved_mean_data[i]) * (x_ptr[j] - saved_mean_data[i]);
+      }
+      saved_variance_data[i] = 1.f / sqrtf(sum / spatial_size + epsilon_);
+    }
+    int in_size = x->dims()[2] * x->dims()[3];
+    // compute out
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float* y_ptr = y_data + i * spatial_size;
+      int c_num = i % groups_;
+      for (int c = 0; c < ch_per_group; c++) {
+        int chin = c_num * ch_per_group + c;
+        float scale_val = scale_data[chin];
+        float bias_val = bias_data[chin];
+        const float* x_ch_ptr = x_ptr + c * in_size;
+        float* y_ch_ptr = y_ptr + c * in_size;
+        for (int j = 0; j < in_size; j++) {
+          y_ch_ptr[j] = scale_val * (x_ch_ptr[j] - saved_mean_data[i]) *
+                            saved_variance_data[i] +
+                        bias_val;
+        }
+      }
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("group_norm");
+    op_desc->SetInput("X", {x_});
+    op_desc->SetInput("Bias", {bias_});
+    op_desc->SetInput("Scale", {scale_});
+    op_desc->SetOutput("Y", {y_});
+    op_desc->SetOutput("SavedMean", {saved_mean_});
+    op_desc->SetOutput("SavedVariance", {saved_variance_});
+    op_desc->SetAttr("epsilon", epsilon_);
+    op_desc->SetAttr("groups", groups_);
+    op_desc->SetAttr("channels", channels_);
+  }
+
+  void PrepareData() override {
+    std::vector<float> x(dims_.production());
+    fill_data_rand(x.data(), -1.f, 1.f, dims_.production());
+
+    DDim scale_bias_dims{{dims_[1]}};
+    std::vector<float> scale(scale_bias_dims.production());
+    fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_dims.production());
+    std::vector<float> bias(scale_bias_dims.production());
+    fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_dims.production());
+
+    SetCommonTensor(x_, dims_, x.data());
+    SetCommonTensor(scale_, scale_bias_dims, scale.data(), {}, true);
+    SetCommonTensor(bias_, scale_bias_dims, bias.data(), {}, true);
+  }
+};
+
+void TestGroupNorm(Place place,
+                   float abs_error = 6e-5,
+                   std::vector<std::string> ignored_outs = {}) {
+  for (auto& n : {1, 3, 16}) {
+    for (auto& c : {1}) {
+      for (auto& h : {1, 16, 33, 56}) {
+        for (auto& w : {1, 17, 55}) {
+          for (auto& groups : {1, 2, 4}) {
+            if (c % groups != 0) {
+              continue;
+            }
+            DDim dim_in({n, c, h, w});
+            float epsilon = 1e-5f;
+            std::unique_ptr<arena::TestCase> tester(new GroupNormComputeTest(
+                place, "def", dim_in, epsilon, groups, c));
+#ifdef LITE_WITH_ARM
+            if (place == TARGET(kARM)) {
+              auto& ctx = tester->context()->As<ARMContext>();
+              ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 4);
+            }
+#endif
+            arena::Arena arena(std::move(tester), place, abs_error);
+            if (!arena.TestPrecision(ignored_outs)) {
+              LOG(ERROR) << "run n: " << n << ", c: " << c << ", h: " << h
+                         << ", w: " << w;
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(GroupNorm, precision) {
+  Place place;
+  float abs_error = 6e-5;
+  std::vector<std::string> ignored_outs = {};
+#ifdef LITE_WITH_ARM
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+  TestGroupNorm(place, abs_error, ignored_outs);
+}
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc
index 5ea01a6cca504db230d62a63ef3a62d4f73470fa..bd4480b6127a318286b3172f53fc8a5bceb8c328 100644
--- a/lite/tests/kernels/layer_norm_compute_test.cc
+++ b/lite/tests/kernels/layer_norm_compute_test.cc
@@ -147,7 +147,7 @@ TEST(LayerNorm, precision) {
   LOG(INFO) << "test layer_norm op";
   float abs_error = 2e-5;
   Place place;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #elif defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc
index 988077c6c319d5bcc8e50d6c8e5544331a86fe45..ae39abf1dbaf206fe0a68dd492a48a2452c8094e 100644
--- a/lite/tests/kernels/lookup_table_compute_test.cc
+++ b/lite/tests/kernels/lookup_table_compute_test.cc
@@ -116,7 +116,7 @@ TEST(LookupTable, precision) {
   abs_error = 1e-2;
 #elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
@@ -132,7 +132,8 @@
TEST(LookupTable, precision) { std::vector>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) { for (auto w_dims : std::vector>{{4, 2}, {6, 8}, {12, 15}}) { -#if defined(LITE_WITH_XPU) && defined(LITE_WITH_NPU) +#if (defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)) || \ + defined(LITE_WITH_NPU) for (auto padding_idx : std::vector{-1}) { // Only -1 is supported by XPU or NPU #else diff --git a/lite/tests/kernels/matmul_compute_test.cc b/lite/tests/kernels/matmul_compute_test.cc index 59b0fde8fd18b8a2170b6fdbd42444f09843f077..9799c15622b07a8d126654c79738d29b176c2cf4 100644 --- a/lite/tests/kernels/matmul_compute_test.cc +++ b/lite/tests/kernels/matmul_compute_test.cc @@ -457,7 +457,7 @@ TEST(Matmul2x2, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -489,7 +489,7 @@ TEST(Matmul2x2_y_transpose, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d070292332b65ed577ec6cefdb220ee691eb99e9..d89b3569358034d72ac8019f2348b49764ca6b0c 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -127,7 +127,7 @@ TEST(Mul, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // use fp16 in npu -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc index a1190197bffdf505fec77c6b22b7871316a2d125..dd16730ef551ddc11825936d99733f33015fd2c0 100644 --- a/lite/tests/kernels/multiclass_nms_compute_test.cc +++ b/lite/tests/kernels/multiclass_nms_compute_test.cc @@ -478,7 +478,7 @@ TEST(multiclass_nms, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc index 04894188b0bf1557000479ae18b0369997909f89..fc4d004e552e76792470f46a54afd6aa13bbc330 100644 --- a/lite/tests/kernels/pool_compute_test.cc +++ b/lite/tests/kernels/pool_compute_test.cc @@ -381,7 +381,7 @@ TEST(Pool, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/prior_box_compute_test.cc b/lite/tests/kernels/prior_box_compute_test.cc index 73fd612c3a03c0a15ddaf3ce6c08ff0ed1a5a95b..ec0eda8cbb2b7f8d6ab01efa467ed857d817905a 100644 --- a/lite/tests/kernels/prior_box_compute_test.cc +++ b/lite/tests/kernels/prior_box_compute_test.cc @@ -21,7 +21,7 @@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 3a866b6cf22cf67c3f5a60e5a4aa8603cee6a1a3..f3fcc0bad5418624c86897bafc52dbf3a7ec0d8e 100644 --- 
a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -206,7 +206,7 @@ TEST(Reshape, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/roi_align_compute_test.cc b/lite/tests/kernels/roi_align_compute_test.cc index 8eb84dd0337d0635dc360e2e04aa1ad047e912c0..2bbfdcd81da951bd769ab03094a0df48f3a6e13b 100644 --- a/lite/tests/kernels/roi_align_compute_test.cc +++ b/lite/tests/kernels/roi_align_compute_test.cc @@ -106,13 +106,11 @@ class RoiAlignComputeTester : public arena::TestCase { } LOG(INFO) << "Read rois data. " << datas[0] << " " << datas.back(); reader.close(); - SetCommonTensor(rois_, dims, datas.data()); - auto rois_tensor = baseline_scope()->FindMutableTensor(rois_); std::vector lod0({0, 152, 304}); LoD lod; lod.push_back(lod0); - rois_tensor->set_lod(lod); + SetCommonTensor(rois_, dims, datas.data(), lod); } }; diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index efd0497002ee402426a7198bf47ec60c7f41d2fd..9d1f4403dc1a82e58d8c764933ba01c0e0b5c082 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -165,7 +165,7 @@ TEST(Scale, precision) { abs_error = 4e-3; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); abs_error = 3e-4; // Some operations use fp16 in XPU #elif defined(LITE_WITH_X86) diff --git a/lite/tests/kernels/sequence_conv_compute_test.cc b/lite/tests/kernels/sequence_conv_compute_test.cc index 84887b2573516d0c82cbb8c9b4cf9336f30ee41d..68afaad04f8e84995e811f81f99a2d4109c845a5 100644 --- a/lite/tests/kernels/sequence_conv_compute_test.cc +++ b/lite/tests/kernels/sequence_conv_compute_test.cc @@ -85,21 +85,31 @@ class SequenceConvComputeTester : public arena::TestCase { auto output_dims = output->dims(); auto output_data = output->mutable_data(); std::vector> res; - if (contextStart_ == -2) { + + if (contextStart_ == -2 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{-0.08867277f, -0.17257819f, -0.2564836f}, {0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}}; - } else if (contextStart_ == -1) { + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}}; - } else if (contextStart_ == 0) { + } else if (contextStart_ == 0 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}, {0.02574372f, 0.03337148f, 0.04099924f}}; + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 2, 4})) { + res = {{0.194508, 0.05720823, -0.08009153}, + {0.7093821, 0.57208234, 0.43478262}, + {0.19450802, 0.17925248, 0.16399695}, + {0.2517162, 0.23646072, 0.22120519}}; } else { fprintf(stderr, "not supported contextStart_\n"); exit(-1); @@ -136,12 +146,25 @@ void TestNormalCase(Place place, float abs_error = 2e-5) { } } +void TestBatchCase(Place place, float abs_error = 2e-5) { + std::vector> 
lod{{0, 2, 4}}; + std::vector dims{4, 5}; + std::vector candidate_pad_idx{-1}; + for (int pad_idx : candidate_pad_idx) { + std::unique_ptr tester(new SequenceConvComputeTester( + place, "def", lod, DDim(dims), pad_idx, 1, 3, 3)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + TEST(sequence_conv, precision) { #ifdef LITE_WITH_ARM float abs_error = 2e-5; Place place(TARGET(kARM)); TestNormalCase(place, abs_error); + TestBatchCase(place, abs_error); #endif } diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index fc96b39f010eab5eedd431cb81e881b7aadb11a2..b566bfa3e86cf6067f9914b5fc3932458a6ee186 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -202,20 +202,15 @@ class SliceComputeTester : public arena::TestCase { DDim({static_cast(ends_.size())}), ends_.data()); } else if (use_tensor_list_) { - Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = scope_.NewTensor("starts_tensor_list_" + - paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = starts_[i]; + SetCommonTensor("starts_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &starts_[i]); } for (int i = 0; i < ends_.size(); ++i) { - auto* tensor = - scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = ends_[i]; + SetCommonTensor("ends_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &ends_[i]); } } } @@ -273,7 +268,7 @@ TEST(Slice, precision) { test_slice(place); test_slice_tensor(place); test_slice_tensor_list(place); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) Place place(TARGET(kXPU)); test_slice(place); #endif diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc index a91f6534ffa1f8022e2005cc83255d306adf77c1..87a94aba184a055081446b4df830b72146834ed2 100644 --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -111,8 +111,12 @@ TEST(Softmax, precision) { for (auto x_dims : std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { - for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= x_dims.size()) continue; + int ndims = x_dims.size(); + for (int axis = -1; axis < ndims; axis++) { +#if defined(LITE_WITH_XPU) + if (axis != -1 && axis != ndims - 1) + continue; // -1 and dims.size() - 1 are only supported by XPU +#endif std::unique_ptr tester( new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); diff --git a/lite/tests/kernels/stack_compute_test.cc b/lite/tests/kernels/stack_compute_test.cc index 10b289e41972eb6a9f332f0376393fdfaae94abe..72529cac5165badd50c086a75e882417725adb96 100644 --- a/lite/tests/kernels/stack_compute_test.cc +++ b/lite/tests/kernels/stack_compute_test.cc @@ -106,7 +106,7 @@ TEST(Stack, precision) { Place place; #ifdef LITE_WITH_ARM place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc index 0ec010e47fe22f0bd60f0c275696f726b6f01a68..933e9f8ec5fc7b1d9b510c71f57fda309a5477dc 100644 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -164,7 +164,7 @@ 
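The TestBatchCase added above feeds sequence_conv a level-0 LoD of {{0, 2, 4}} with a 4 x 5 input, i.e. the four rows are split into two sequences of two rows each. A small hedged sketch of how such LoD offsets partition the rows (the helper name is illustrative only):

#include <cstddef>
#include <cstdio>
#include <vector>

// Split the rows of a (num_rows x width) buffer into sequences using
// level-0 LoD offsets, e.g. {0, 2, 4} -> rows [0, 2) and rows [2, 4).
void print_sequences(const std::vector<float>& data, int width,
                     const std::vector<std::size_t>& lod0) {
  for (std::size_t s = 0; s + 1 < lod0.size(); ++s) {
    std::printf("sequence %zu: rows %zu..%zu\n", s, lod0[s], lod0[s + 1] - 1);
    for (std::size_t r = lod0[s]; r < lod0[s + 1]; ++r) {
      for (int c = 0; c < width; ++c) {
        std::printf("%6.2f ", data[r * width + c]);
      }
      std::printf("\n");
    }
  }
}

int main() {
  std::vector<float> x(4 * 5);
  for (std::size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);
  print_sequences(x, 5, {0, 2, 4});  // two sequences, two rows each
}

This is why the new expected-result branch in the test is keyed on both contextStart_ and the exact LoD: the batched case convolves each two-row sequence independently.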
TEST(Transpose, precision) { LOG(INFO) << "test Transpose op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/yolo_box_compute_test.cc b/lite/tests/kernels/yolo_box_compute_test.cc index c41c89608fd7496c5b01b1a813581f7f461ff0ee..b88f25e1e0ddb85683297c19a841a5d47b2bbccf 100644 --- a/lite/tests/kernels/yolo_box_compute_test.cc +++ b/lite/tests/kernels/yolo_box_compute_test.cc @@ -247,7 +247,7 @@ TEST(YoloBox, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index 8265f9db2f85e54dd91314ac5dc7932e7f7e842a..9ad98ce6f4566898b3821e6bf540b331a84b97bb 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -236,19 +236,19 @@ void test_conv_fp32(const std::vector& input_dims, double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; - LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.LapTimes().Avg() - << ", min time: " << t0.LapTimes().Min() - << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + VLOG(4) << "conv fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; double max_diff = 0; tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff - << ", max ratio: " << max_ratio; + VLOG(4) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; if (std::abs(max_ratio) > 1e-3f) { if (max_diff > 5e-4f) { LOG(WARNING) << "basic result"; @@ -274,15 +274,15 @@ void test_conv_fp32(const std::vector& input_dims, } } } - LOG(INFO) << "test fp32 conv: input: " << dim_in - << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] - << ", " << pads[3] << ", stride: " << strides[0] << ", " - << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] - << ", group: " << group - << ", bias: " << (flag_bias ? "true" : "false") - << ", act: " << flag_act << ", threads: " << th - << ", power_mode: " << cls << " successed!!\n"; + VLOG(4) << "test fp32 conv: input: " << dim_in + << ", output: " << dim_out << ", weight dim: " << weight_dim + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? 
"true" : "false") + << ", act: " << flag_act << ", threads: " << th + << ", power_mode: " << cls << " successed!!\n"; } } } diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index 8dac81fe9f08f3e85fab844ce2df0965fbb52289..ecd5c3966df3115a366fd722b3978258c88c0bf5 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -34,7 +34,7 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); DEFINE_int32(batch, 1, "batch size"); @@ -614,6 +614,9 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { dims.push_back(DDim({batch, cin, h, h})); } } + if (cin == 1 && cout == 1) { + continue; + } test_conv_int8(dims, weights_dim, 1, diff --git a/lite/tests/math/deformable_conv_compute_test.cc b/lite/tests/math/deformable_conv_compute_test.cc index e97203123d1db0752189a9965c922b048cd6bd38..76cb970ffe428ed393cdbdae0d281e6a511655ac 100644 --- a/lite/tests/math/deformable_conv_compute_test.cc +++ b/lite/tests/math/deformable_conv_compute_test.cc @@ -34,7 +34,7 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); DEFINE_int32(batch, 1, "batch size"); @@ -342,7 +342,7 @@ TEST(TestDeformableConvRand, test_deformable_conv_rand) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8}) { for (auto& cout : {1, 5, 16}) { - for (auto& g : {1, 2}) { + for (auto& g : {1}) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index adae19d013e50fbd484257a99f55229c75b94263..57899c8d1e2e0c073f410e90d18119327f21f066 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -120,6 +120,10 @@ bool test_gemm_int8(bool tra, auto dc_fp32 = tc_fp32.mutable_data(); auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); auto dbias = tbias.mutable_data(); if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index 99db53511446ecd4772fa2fd1b202337581506ef..3819c0dcd7f87c69a5805aae643a6a3a4a037f03 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -108,6 +108,10 @@ bool test_gemv_int8(bool tra, auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); auto dbias = tbias.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc index 3e5577e03075502bab30aa03a50241b817fa8742..ecdf77fd37fff1da2914eeca5e29ef931de09c53 100644 --- a/lite/tests/math/sgemm_c4_compute_test.cc +++ 
b/lite/tests/math/sgemm_c4_compute_test.cc @@ -92,6 +92,7 @@ bool test_sgemm_c4( auto db_c4 = tb_c4.mutable_data(); auto dc_basic = tc_basic.mutable_data(); auto dbias = tbias.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); // trans A, B to c4 basic_trans_mat_to_c4(da, da_c4, k, m, k, true); @@ -179,6 +180,141 @@ bool test_sgemm_c4( #endif return true; } +bool test_sgemm_c8( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 7) / 8 * 8; + int k_round = (k + 7) / 8 * 8; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c8 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c8; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c8.Resize({size_b_c8}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kInt16)); + tb.set_precision(PRECISION(kInt16)); + ta_c4.set_precision(PRECISION(kInt16)); + tb_c8.set_precision(PRECISION(kInt16)); + tc.set_precision(PRECISION(kInt32)); + tc_basic.set_precision(PRECISION(kInt32)); + tbias.set_precision(PRECISION(kInt32)); + + fill_tensor_rand(ta); + fill_tensor_rand(tb); + fill_tensor_rand(tbias); + fill_tensor_rand(tc); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c8 = tb_c8.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c8(da, da_c4, k, m, k, true); + basic_trans_mat_to_c8(db, db_c8, n, k, n, false); + + LOG(INFO) << "sgemm_c8 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c8(false, + false, + m, + n, + k, + 1, + da, + k, + db, + n, + 0, + dc_basic, + n, + dbias, + false, + false); + } + Timer t0; + LOG(INFO) << "basic test end"; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c8_int16_small( + m, n, k, da_c4, db_c8, dc, &ctx); + } + LOG(INFO) << "basic test end"; + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c8_int16_small( + m, n, k, da_c4, db_c8, dc, &ctx); + t0.Stop(); + } + LOG(INFO) << "basic test end"; + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kInt32)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c8: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c8: "; + print_tensor(tb_c8); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { if (FLAGS_basic_test) { @@ -186,11 +322,11 @@ TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { paddle::lite::DeviceInfo::Init(); #endif LOG(INFO) << "run basic sgemm_c4 test"; - for (auto& m : {1, 3, 8, 32, 397}) { - for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { - for (auto& k : {1, 3, 8, 59, 234}) { - for (auto& has_bias : {false, true}) { - for (auto& has_relu : {false, true}) { + for (auto& m : {1, 3, 8, 32, 397, 32, 64, 77}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789, 1}) { + for (auto& k : {1, 3, 8, 59, 234, 19}) { + for (auto& has_bias : {false}) { + for (auto& has_relu : {false}) { for (auto& th : {1, 2, 4}) { auto flag = test_sgemm_c4( m, n, k, has_bias, has_relu, FLAGS_power_mode, th); @@ -213,8 +349,41 @@ TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { } } } +TEST(TestSgemmC8, test_func_sgemm_c8_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397, 32, 64, 77}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789, 1}) { + for (auto& k : {1, 3, 8, 59, 234, 19}) { + for (auto& has_bias : {false}) { + for (auto& has_relu : {false}) { + for (auto& th : {1}) { + auto flag = test_sgemm_c8( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} -TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +TEST(TestSgemmCnCustom, test_func_sgemm_cn_prepacked_custom) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -230,6 +399,18 @@ TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu << " failed!!"; } + flag = test_sgemm_c8(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu << " passed!!"; diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc index 91a1fe1770dfa3eeb3f3b94fcd2361f1c1634b1e..661c4f02aa7eafe807f77767dfd4db01a338993e 100644 --- a/lite/tests/math/sgemv_compute_test.cc +++ b/lite/tests/math/sgemv_compute_test.cc @@ -84,6 +84,7 @@ bool test_sgemv(bool tra, auto db = tb.mutable_data(); auto dc = tc.mutable_data(); auto dc_basic = tc_basic.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); auto dbias = tbias.mutable_data(); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index a1e793f91d8cd75a2daa7eb46134b841ecf1eac7..0a89d7ca3eaf52ccdfd6c1ce1727669b8c7284e1 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -62,6 +62,72 @@ static void basic_trans_mat_to_c4(const type* input, } delete[] zero_buf; } +template +static void basic_trans_mat_to_c8(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 7) / 8 * 8; + int k_round = (K + 7) / 8 * 8; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 8; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 8 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + const type* in4 = in3 + ldin; + const type* in5 = in4 + ldin; + const type* in6 = in5 + ldin; + const type* in7 = in6 + ldin; + if (8 * (i + 1) - M > 0) { + switch (8 * (i + 1) - M) { + case 7: + in1 = zero_buf; + case 6: + in2 = zero_buf; + case 5: + in3 = zero_buf; + case 4: + in4 = zero_buf; + case 3: + in5 = zero_buf; + case 2: + in6 = zero_buf; + case 1: + in7 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + *output++ = *in4++; + *output++ = *in5++; + *output++ = *in6++; + *output++ = *in7++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} template static void basic_gemm_c4(bool trans_a, @@ -118,6 +184,60 @@ static void basic_gemm_c4(bool trans_a, free(tmp_c); } +template +static void basic_gemm_c8(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + 
const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! trans c to c4 + basic_trans_mat_to_c8(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} template static void basic_gemm(bool trans_a, bool trans_b, diff --git a/lite/tests/utils/tensor_utils.h b/lite/tests/utils/tensor_utils.h index 5a48b9da6c28b8da784acdaac4d89900d44728f9..8882bb2c08f7e5c930ad7284b31ccd4fd30b8c65 100644 --- a/lite/tests/utils/tensor_utils.h +++ b/lite/tests/utils/tensor_utils.h @@ -50,6 +50,10 @@ void fill_tensor_const(Tensor& tensor, float value) { // NOLINT fill_tensor_host_const_impl( tensor.mutable_data(), static_cast(value), size); break; + case PRECISION(kInt16): + fill_tensor_host_const_impl( + tensor.mutable_data(), static_cast(value), size); + break; case PRECISION(kInt32): fill_tensor_host_const_impl( tensor.mutable_data(), static_cast(value), size); @@ -78,6 +82,12 @@ void fill_tensor_host_rand_impl(signed char* dio, int64_t size) { } } template <> +void fill_tensor_host_rand_impl(int16_t* dio, int64_t size) { + for (int64_t i = 0; i < size; ++i) { + dio[i] = (rand() % 256 - 128) * 2; // NOLINT + } +} +template <> void fill_tensor_host_rand_impl(unsigned char* dio, int64_t size) { for (int64_t i = 0; i < size; ++i) { @@ -95,6 +105,9 @@ void fill_tensor_rand(Tensor& tensor) { // NOLINT case PRECISION(kInt8): fill_tensor_host_rand_impl(tensor.mutable_data(), size); break; + case PRECISION(kInt16): + fill_tensor_host_rand_impl(tensor.mutable_data(), size); + break; case PRECISION(kInt32): fill_tensor_host_rand_impl(tensor.mutable_data(), size); break; diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 9365120772d96d31ff0af98c2cab4dea609be5ab..f3f9b9a94236b0d4f25448deb6a702b82c38740f 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -22,6 +22,7 @@ OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF WITH_LOG=ON +WITH_EXCEPTION=OFF WITH_PROFILE=OFF BUILD_NPU=OFF NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ @@ -32,6 +33,9 @@ BUILD_APU=OFF APU_DDK_ROOT="$(pwd)/apu_sdk_lib/" BUILD_RKNPU=OFF RKNPU_DDK_ROOT="$(pwd)/rknpu/" +WITH_HUAWEI_ASCEND_NPU=OFF # Huawei Ascend Builder/Runtime Libs on X86 host +# default installation path, ensure acllib/atc/opp directories are all in this root dir +HUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" PYTHON_EXECUTABLE_OPTION="" readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -39,8 +43,8 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$PWD # if operating in mac env, we should expand the maximum file num -os_nmae=`uname 
-s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -126,6 +130,7 @@ function make_tiny_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -181,6 +186,7 @@ function make_opencl { -DWITH_TESTING=OFF \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_CV=$BUILD_CV \ -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 @@ -219,6 +225,7 @@ function make_full_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -343,6 +350,8 @@ function make_cuda { -DLITE_WITH_STATIC_CUDA=OFF \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT @@ -358,6 +367,11 @@ function make_x86 { root_dir=$(pwd) build_directory=$BUILD_DIR/build.lite.x86 + if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then + export CXX=/usr/bin/g++ # Ascend need g++ in centos + build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu + fi + if [ -d $build_directory ] then rm -rf $build_directory @@ -379,10 +393,13 @@ function make_x86 { -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=$WITH_HUAWEI_ASCEND_NPU \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT=$HUAWEI_ASCEND_NPU_DDK_ROOT \ -DCMAKE_BUILD_TYPE=Release \ -DPY_VERSION=$PY_VERSION \ $PYTHON_EXECUTABLE_OPTION @@ -409,6 +426,7 @@ function print_usage { echo echo -e "optional argument:" echo -e "--with_log: (OFF|ON); controls whether to print log information, default is ON" + echo -e "--with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_train: (OFF|ON); controls whether to publish training operators and kernels, build_train is only for full_publish library now" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" @@ -491,6 +509,17 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARM_OS=="android" && $ARM_ABI == "armv7" && $ARM_LANG != "clang" ]]; then + set +x + echo + echo -e "error: only clang provide C++ exception handling support for 32-bit ARM." 
+ echo + exit 1 + fi + shift + ;; --with_profile=*) WITH_PROFILE="${i#*=}" shift @@ -539,6 +568,14 @@ function main { RKNPU_DDK_ROOT="${i#*=}" shift ;; + --with_huawei_ascend_npu=*) + WITH_HUAWEI_ASCEND_NPU="${i#*=}" + shift + ;; + --huawei_ascend_npu_ddk_root=*) + HUAWEI_ASCEND_NPU_DDK_ROOT="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_android.sh b/lite/tools/build_android.sh index aba5fb706cb62e5bc9b50127f16d07e0db55d595..ecf34f0dfc4ddd141af9ea07dd6c4f15d1c0c16b 100755 --- a/lite/tools/build_android.sh +++ b/lite/tools/build_android.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x ##################################################################################################### # 1. global variables, you can change them according to your requirements ##################################################################################################### @@ -17,6 +17,8 @@ WITH_JAVA=ON WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. OPTMODEL_DIR="" WITH_STRIP=OFF @@ -145,6 +147,7 @@ function make_tiny_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -194,6 +197,7 @@ function make_full_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -237,6 +241,7 @@ function print_usage { echo -e "| --with_java: (OFF|ON); controls whether to publish java api lib, default is ON |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -269,6 +274,7 @@ function main { if [ -z "$1" ]; then # compiling result contains light_api lib only, recommanded. make_tiny_publish_so $ARCH $TOOLCHAIN $ANDROID_STL + exit 0 fi # Parse command line. @@ -319,6 +325,18 @@ function main { WITH_LOG="${i#*=}" shift ;; + # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARCH == "armv7" && $TOOLCHAIN != "clang" ]]; then + set +x + echo + echo -e "Error: only clang provide C++ exception handling support for 32-bit ARM." + echo + exit 1 + fi + shift + ;; # compiling lib which can operate on opencl and cpu. --with_opencl=*) WITH_OPENCL="${i#*=}" @@ -358,6 +376,7 @@ function main { done # compiling result contains light_api lib only, recommanded. 
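Regarding the --with_exception option introduced above: when the library is built with it, fatal checks are reported by throwing std::exception instead of aborting the process (see the logging.h change later in this patch), so a host application can guard calls with try/catch. The sketch below is a hedged stand-in for that behavior, not Paddle-Lite's actual check macro.

#include <cstdlib>
#include <exception>
#include <iostream>

// Illustrative stand-in for a fatal check inside the library: with
// LITE_WITH_EXCEPTION defined it throws, otherwise it aborts.
void fatal_check(bool cond, const char* msg) {
  if (cond) return;
  std::cerr << msg << std::endl;
#ifdef LITE_WITH_EXCEPTION
  throw std::exception();
#else
  std::abort();
#endif
}

int main() {
  try {
    fatal_check(false, "shape mismatch");  // simulated library error
  } catch (const std::exception&) {
    std::cerr << "recovered from a fatal error instead of crashing\n";
  }
}

Note the constraint enforced by the build scripts: on 32-bit ARM Android, C++ exception handling is only supported with the clang toolchain, hence the error check around --with_exception for armv7 + gcc.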
make_tiny_publish_so + exit 0 } main $@ diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh index 964da15b0b6fcf888812271b0a2c944d9efa63b8..055f6a35c3ab145e9dfe4bc5d46172a2119ffb25 100755 --- a/lite/tools/build_bm.sh +++ b/lite/tools/build_bm.sh @@ -43,7 +43,7 @@ function prepare_thirdparty { # clone bmlibs if [ ! -d ${workspace}/third-party/bmlibs ]; then git clone https://github.com/AnBaolei1984/bmlibs.git ${workspace}/third-party/bmlibs - fi + fi } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. @@ -70,6 +70,13 @@ function build_bm { mkdir -p $build_dir cd $build_dir + if [ $TARGET_NAME == "BM1684" ]; then + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc5_libs" + else + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc3_libs" + fi + echo $BM_SDK_ROOT + prepare_workspace cmake .. \ ${CMAKE_COMMON_OPTIONS} \ @@ -95,17 +102,7 @@ function main { case $i in --target_name=*) TARGET_NAME="${i#*=}" - shift - ;; - #--bm_sdk_root=*) - # BM_SDK_ROOT="${i#*=}" - # shift - # ;; - bm) build_bm - shift - ;; - *) # unknown option print_usage exit 1 diff --git a/lite/tools/build_ios.sh b/lite/tools/build_ios.sh index 2c7eeb466f3d82cf491b6a631d79918fa4fd4cd2..4eea073a058ba9e1e821e9f0746687baa0c38d5f 100755 --- a/lite/tools/build_ios.sh +++ b/lite/tools/build_ios.sh @@ -12,6 +12,8 @@ WITH_EXTRA=OFF WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # absolute path of Paddle-Lite. workspace=$PWD/$(dirname $0)/../../ # options of striping lib according to input model. @@ -69,6 +71,7 @@ function make_ios { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DLITE_WITH_X86=OFF \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_ARCH_ABI=$arch \ @@ -96,6 +99,7 @@ function print_usage { echo -e "| --arch: (armv8|armv7), default is armv8 |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -140,6 +144,10 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; help) print_usage exit 0 @@ -152,6 +160,7 @@ function main { esac done make_ios $ARCH + exit 0 } main $@ diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 5ed491cb7da7b33357b7e66ab8267e60815b5348..f6de128feb6073fe206d03b68c5d8bc04dc9f16c 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -17,6 +17,8 @@ PY_VERSION="" WITH_CV=OFF # controls whether to print log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. 
WITH_STRIP=OFF OPTMODEL_DIR="" @@ -60,6 +62,7 @@ function init_cmake_mutable_options { -DPY_VERSION=$PY_VERSION \ -DLITE_WITH_CV=$WITH_CV \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_OPENCL=$WITH_OPENCL \ @@ -210,6 +213,7 @@ function print_usage { echo -e "| --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| |" echo -e "| arguments of striping lib according to input model: |" echo -e "| ./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir |" @@ -280,6 +284,11 @@ function main { shift ;; # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; + # ON or OFF, default OFF --with_strip=*) BUILD_TAILOR="${i#*=}" shift diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh index 01d71aaf213abb99633112664af580b897ce7454..e0fb2ab11b110cf5a29151ea7c8e544a4074c8c5 100755 --- a/lite/tools/build_mlu.sh +++ b/lite/tools/build_mlu.sh @@ -4,7 +4,7 @@ set -ex # global variables with default value NEUWARE_HOME="${NEUWARE_HOME}" TARGET_NAME="all" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +BUILD_EXTRA=ON # ON(with sequence ops)/OFF WITH_TESTING=ON # ON/OFF function print_usage { @@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$(pwd) function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + if [ ! -d $workspace/third-party ]; then rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive fi + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xvf third-party-05b862.tar.gz } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. diff --git a/lite/tools/check_api_approvals.sh b/lite/tools/check_api_approvals.sh old mode 100644 new mode 100755 index 6100558d68abb2b4c82c1f367078e519972546ce..b2a4659c964121b0a95961195340c296710db2de --- a/lite/tools/check_api_approvals.sh +++ b/lite/tools/check_api_approvals.sh @@ -5,13 +5,14 @@ if [ -z ${BRANCH} ]; then fi LITE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../.." && pwd )" - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle-Lite/pulls/${GIT_PR_ID}/reviews?per_page=10000` -git_files=`git diff --numstat upstream/$BRANCH| wc -l` -git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 echo_list=() +# approval list +Superjomn=328693 +DannyIsFunny=45189361 + function add_failed(){ failed_num=`expr $failed_num + 1` echo_list="${echo_list[@]}$1" @@ -24,20 +25,105 @@ function check_approval(){ add_failed "${failed_num}. 
${echo_line}" fi } +#################################################################################################### +# Check 1: You must have Superjomn's (Yunchunwei) approval for changing +# 20+ files or adding more than 1000+ lines of content +#################################################################################################### +function CheckModifiedFileNums() { + git_files=`git diff --numstat upstream/$BRANCH| wc -l` + git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` + + if [[ $git_files -gt 19 || $git_count -gt 999 ]];then + echo_line="You must have Superjomn's (Yunchunwei) approval for changing 20+ files or adding more than 1000+ lines of content.\n" + check_approval 1 $Superjomn + fi + if [ -n "${echo_list}" ];then + echo "****************" + echo -e "${echo_list[@]}" + echo "There are ${failed_num} approved errors." + echo "****************" + fi + + if [ -n "${echo_list}" ]; then + exit 1 + fi +} +#################################################################################################### +# Check 2: You must have Superjomn's (Yunchunwei) approval for increasing +# size of dynamic lib for 10+ kb +#################################################################################################### +function CheckLibSizeDiff() { + # step1: record lib size of current branch + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + current_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` -if [[ $git_files -gt 19 || $git_count -gt 999 ]];then - echo_line="You must have Superjomn (Yunchunwei) approval for change 20+ files or add than 1000+ lines of content.\n" - check_approval 1 328693 -fi + # step2: record lib size of current develop branch + git checkout develop + git clean -f . && git checkout . + git fetch upstream && git merge upstream/develop -if [ -n "${echo_list}" ];then - echo "****************" - echo -e "${echo_list[@]}" - echo "There are ${failed_num} approved errors." - echo "****************" -fi + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + develop_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + + # step3: if diff_size > 10485, special approval is needed + diff_size=$[$current_size - $develop_size] + if [ $diff_size -gt 10485 ]; then + echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" + echo "****************" + echo -e "${echo_line[@]}" + echo "There is an approved errors." + echo "****************" + exit 1 + fi +# Todo: Code below should be applied later. +# if [ $diff_size -gt 10485 ]; then +# echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" +# check_approval 1 $Superjomn +# fi +# +# if [ -n "${echo_list}" ];then +# echo "****************" +# echo -e "${echo_list[@]}" +# echo "There are ${failed_num} approved errors." 
+# echo "****************" +# fi +# +# if [ -n "${echo_list}" ]; then +# exit 1 +# fi +} + +#################################################################################################### +# Main functions +#################################################################################################### +function main { + if [ -z "$1" ]; then + # at least on argument is needed + echo "Error: at least on argument is needed!" + exit 1 + fi + + # Parse command line. + for i in "$@"; do + case $i in + check_modified_file_nums) + # modified files num can not exceed 20 + + CheckModifiedFileNums + exit 0 + ;; + check_lib_size_diff) + # size diff can not exceed 10K + + CheckLibSizeDiff + exit 0 + ;; + *) + # unknown option + echo "Error: unsupported input argument!" + exit 1 + ;; + esac + done +} -if [ -n "${echo_list}" ]; then - exit 1 -fi +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 29ed9100f932b3215e45fc2352b5f0d73b7349b1..9cec7cdc5d566d1db5a8de4c723a9e0b11408d4d 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -21,8 +21,8 @@ USE_ADB_EMULATOR=ON LITE_WITH_COVERAGE=OFF # if operating in mac env, we should expand the maximum file num -os_nmae=`uname -s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -279,7 +279,7 @@ function test_server { } function assert_api_spec_approvals() { - /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh + /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh check_modified_file_nums if [ "$?" != 0 ];then exit 1 fi @@ -353,7 +353,7 @@ function cmake_xpu { -DWITH_MKL=ON \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=ON \ - -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" + -DXPU_SDK_ROOT="/opt/output" } function build_xpu { @@ -399,6 +399,64 @@ function build_test_xpu { test_xpu } +function cmake_huawei_ascend_npu { + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${common_flags} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=ON \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" \ + -DCMAKE_BUILD_TYPE=Release +} + +function build_huawei_ascend_npu { + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE +} + +# It will eagerly test all lite related unittests. +function test_huawei_ascend_npu { + # Due to the missing of ascend kernels, we skip the following tests temporarily. + # TODO(xxx) clear the skip list latter + local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet" + "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" + "test_inceptionv4_lite_x86" "test_light_api" + "test_apis" "test_model_bin" + ) + local to_skip=0 + for _test in $(cat $TESTS_FILE); do + to_skip=0 + for skip_name in ${skip_list[@]}; do + if [ $skip_name = $_test ]; then + echo "to skip " $skip_name + to_skip=1 + fi + done + + if [ $to_skip -eq 0 ]; then + ctest -R $_test -V + fi + done +} + +# Build the code and run lite server tests. This is executed in the CI system. 
+function build_test_huawei_ascend_npu { + cur_dir=$(pwd) + + build_dir=$cur_dir/build.lite.huawei_ascend_npu_test + mkdir -p $build_dir + cd $build_dir + + cmake_huawei_ascend_npu + build_huawei_ascend_npu + + test_huawei_ascend_npu +} + # test_arm_android function test_arm_android { local test_name=$1 @@ -415,7 +473,7 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl") + skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl" "test_transformer_with_mask_fp32_arm") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done @@ -564,8 +622,18 @@ function test_arm_model { function test_model_optimize_tool_compile { cd $workspace cd build + # Compile opt tool cmake .. -DWITH_LITE=ON -DLITE_ON_MODEL_OPTIMIZE_TOOL=ON -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON make opt -j$NUM_CORES_FOR_COMPILE + # Check whether opt can transform quantized mobilenetv1 successfully. + cd lite/api && chmod +x ./opt + wget --no-check-certificate https://paddlelite-data.bj.bcebos.com/doc_models/MobileNetV1_quant.tar.gz + tar zxf MobileNetV1_quant.tar.gz + ./opt --model_dir=./MobileNetV1_quant --valid_targets=arm --optimize_out=quant_mobilenetv1 + if [ ! -f quant_mobilenetv1.nb ]; then + echo -e "Error! Resulted opt can not tramsform MobileNetV1_quant successfully!" 
+ exit 1 + fi } function _test_paddle_code_generator { @@ -1147,6 +1215,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + test_huawei_ascend_npu) + test_huawei_ascend_npu + shift + ;; build_test_cuda_server) build_test_cuda_server shift @@ -1164,6 +1236,10 @@ function main { build_test_xpu shift ;; + build_test_huawei_ascend_npu) + build_test_huawei_ascend_npu + shift + ;; build_test_train) build_test_train shift @@ -1189,6 +1265,7 @@ function main { build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu build_test_arm_subtask_model test_resnet50 resnet50 build_test_arm_subtask_model test_inceptionv4 inception_v4_simple + build_test_arm_subtask_model test_transformer_with_mask_fp32_arm transformer_with_mask_fp32 shift ;; build_test_arm_subtask_armlinux) diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index abb60f6141fbee53916a7db1711cf606afb09924..0cf14d12d553a4d9f7f4ed9780e4274560a8b23f 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,8 +56,8 @@ const std::vector> supported_ops_target = { ops_lines = [] # valid targets and valid_ops -valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[]] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 @@ -73,6 +73,7 @@ class TargetType: kMLU = 11 kRKNPU = 12 kAPU = 13 + kHuaweiAscendNPU = 14 # record op_info of valid kernels into `valid_ops` according to different target type diff --git a/lite/utils/all.h b/lite/utils/all.h index a0d323aa24b36dac7858f484eb1cf1d5a7bcba50..8586188b99971d04271d14ac2d3b8043b0ea4414 100644 --- a/lite/utils/all.h +++ b/lite/utils/all.h @@ -14,10 +14,16 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "lite/utils/any.h" #include "lite/utils/check.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/factory.h" #include "lite/utils/hash.h" #include "lite/utils/io.h" #include "lite/utils/macros.h" diff --git a/lite/utils/env.h b/lite/utils/env.h index 3048c84b42f6f658eaf0c8ee0d08456f53162c37..1d26148cea1ed499c8d5ca408ae9235788be6e91 100644 --- a/lite/utils/env.h +++ b/lite/utils/env.h @@ -15,13 +15,24 @@ #pragma once #include #include - #include #include +// Specify the path of configuration file for the subgraph segmentation, an +// example is shown as below: +// op_type:in_var_name_0,in_var_name1:out_var_name_0,out_var_name1 +// op_type::out_var_name_0 +// op_type:in_var_name_0 +// op_type #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \ "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE" +// The original weight/local/unused variables in the subblock of the subgraph op +// will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to true(default) during +// the analysis phase, it ensure the ops in the subblock can be converted to the +// target device model online during the execution phase. 
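On the SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE comment documented a few lines above: each non-empty line of the configuration file names an op type and, optionally, colon-separated input and output variable lists. The parsing sketch below only illustrates that documented line format; it is not the library's implementation, and it is unrelated to the SUBGRAPH_ONLINE_MODE flag defined next.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// One rule per line, of the documented form:
//   op_type[:in_var_0,in_var_1[:out_var_0,out_var_1]]
// Empty fields are allowed, e.g. "op_type::out_var_0".
struct PartitionRule {
  std::string op_type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

static std::vector<std::string> SplitCSV(const std::string& s) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) out.push_back(item);
  }
  return out;
}

PartitionRule ParseLine(const std::string& line) {
  std::stringstream ss(line);
  std::string op, ins, outs;
  std::getline(ss, op, ':');
  std::getline(ss, ins, ':');
  std::getline(ss, outs, ':');
  return {op, SplitCSV(ins), SplitCSV(outs)};
}

int main() {
  const std::vector<std::string> lines = {"conv2d:in0,in1:out0", "softmax::out0",
                                          "relu"};
  for (const auto& l : lines) {
    PartitionRule r = ParseLine(l);
    std::cout << r.op_type << ": " << r.inputs.size() << " input(s), "
              << r.outputs.size() << " output(s)\n";
  }
}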
+#define SUBGRAPH_ONLINE_MODE "SUBGRAPH_ONLINE_MODE" + namespace paddle { namespace lite { diff --git a/lite/utils/factory.h b/lite/utils/factory.h deleted file mode 100644 index d286ceb42ce32dba68bc68cabab2a600ad3d7789..0000000000000000000000000000000000000000 --- a/lite/utils/factory.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Factor for any Type creator. - * - * Usage: - * - * struct SomeType; - * // Register a creator. - * Factory::Global().Register("some_key", [] -> - * std::unique_ptr { ... }); - * // Retrive a creator. - * auto some_type_instance = Factory::Global().Create("some_key"); - */ -template -class Factory { - public: - using item_t = ItemType; - using self_t = Factory; - using item_ptr_t = ItemTypePtr; - using creator_t = std::function; - - static Factory& Global() { - static Factory* x = new self_t; - return *x; - } - - void Register(const std::string& op_type, creator_t&& creator) { - creators_[op_type].emplace_back(std::move(creator)); - } - - item_ptr_t Create(const std::string& op_type) const { - auto res = Creates(op_type); - if (res.empty()) return nullptr; - CHECK_EQ(res.size(), 1UL) << "Get multiple Op for type " << op_type; - return std::move(res.front()); - } - - std::list Creates(const std::string& op_type) const { - std::list res; - auto it = creators_.find(op_type); - if (it == creators_.end()) return res; - for (auto& c : it->second) { - res.emplace_back(c()); - } - return res; - } - - std::string DebugString() const { - STL::stringstream ss; - for (const auto& item : creators_) { - ss << " - " << item.first << "\n"; - } - return ss.str(); - } - - protected: - std::map> creators_; -}; - -/* A helper function to help run a lambda at the start. - */ -template -class Registor { - public: - explicit Registor(std::function&& functor) { functor(); } - - // Touch will do nothing. 
- int Touch() { return 0; } -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/utils/io.h b/lite/utils/io.h index 2141364df79bb189772592a556dd9a115ae1a67e..5de95e72f06856df01189e8ae3f1c22115801094 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -120,5 +120,40 @@ static std::vector ListDir(const std::string& path, return paths; } +static bool ReadFile(const std::string& filename, std::vector* contents) { + FILE* fp = fopen(filename.c_str(), "rb"); + if (!fp) return false; + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + fseek(fp, 0, SEEK_SET); + contents->clear(); + contents->resize(size); + size_t offset = 0; + char* ptr = reinterpret_cast(&(contents->at(0))); + while (offset < size) { + size_t already_read = fread(ptr, 1, size - offset, fp); + offset += already_read; + ptr += already_read; + } + fclose(fp); + return true; +} + +static bool WriteFile(const std::string& filename, + const std::vector& contents) { + FILE* fp = fopen(filename.c_str(), "wb"); + if (!fp) return false; + size_t size = contents.size(); + size_t offset = 0; + const char* ptr = reinterpret_cast(&(contents.at(0))); + while (offset < size) { + size_t already_written = fwrite(ptr, 1, size - offset, fp); + offset += already_written; + ptr += already_written; + } + fclose(fp); + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.h b/lite/utils/logging.h index f292f220c006135af664ea34acc03525a5c112ab..c7fa8d4cf113abebb29c4ebe972e243a39573cf0 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -57,7 +57,7 @@ static int gettimeofday(struct timeval* tp, void* tzp) { #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" -#ifdef LITE_WITH_ANDROID +#if defined(LITE_WITH_LOG) && defined(LITE_WITH_ANDROID) #include // Android log macors #define ANDROID_LOG_TAG "Paddle-Lite" @@ -143,8 +143,10 @@ class LogMessage { ANDROID_LOG_I(log_stream_.str().c_str()); } else if (level_ == "W") { ANDROID_LOG_W(log_stream_.str().c_str()); + } else if (level_ == "F") { + ANDROID_LOG_F(log_stream_.str().c_str()); } else { - fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + fprintf(stderr, "Unsupported log level: %s\n", level_.c_str()); assert(false); } #endif @@ -170,17 +172,25 @@ class LogMessageFatal : public LogMessage { const char* level = "F") : LogMessage(file, func, lineno, level) {} - ~LogMessageFatal() { + ~LogMessageFatal() +#ifdef LITE_WITH_EXCEPTION + noexcept(false) +#endif + { log_stream_ << '\n'; #ifdef LITE_WITH_ANDROID ANDROID_LOG_F(log_stream_.str().c_str()); #endif fprintf(stderr, "%s", log_stream_.str().c_str()); +#ifdef LITE_WITH_EXCEPTION + throw std::exception(); +#else #ifndef LITE_ON_TINY_PUBLISH abort(); #else assert(false); +#endif #endif } }; @@ -237,7 +247,11 @@ class Voidify { class VoidifyFatal : public Voidify { public: +#ifdef LITE_WITH_EXCEPTION + ~VoidifyFatal() noexcept(false) { throw std::exception(); } +#else ~VoidifyFatal() { assert(false); } +#endif }; #endif diff --git a/lite/utils/md5.h b/lite/utils/md5.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5 --- /dev/null +++ b/lite/utils/md5.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/lite/utils/md5.h b/lite/utils/md5.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5
--- /dev/null
+++ b/lite/utils/md5.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+namespace paddle {
+namespace lite {
+
+std::string MD5(std::string message) {
+  const uint32_t shiftAmounts[] = {
+      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
+      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
+  const uint32_t partsOfSines[] = {
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
+
+  uint32_t state[4];
+  state[0] = 0x67452301;
+  state[1] = 0xefcdab89;
+  state[2] = 0x98badcfe;
+  state[3] = 0x10325476;
+
+  // Pad with zeros
+  int size = ((((message.length() + 8) / 64) + 1) * 64) - 8;
+  uint8_t *buf = reinterpret_cast<uint8_t *>(calloc(size + 64, 1));
+  memcpy(buf, message.c_str(), message.length());
+  buf[message.length()] = 128;
+  uint32_t bits = 8 * message.length();
+  memcpy(buf + size, &bits, 4);
+
+// Process at each 512-bit(64 bytes) chunk
+#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
+  for (int offset = 0; offset < size; offset += 64) {
+    uint32_t A = state[0];
+    uint32_t B = state[1];
+    uint32_t C = state[2];
+    uint32_t D = state[3];
+    uint32_t *W = reinterpret_cast<uint32_t *>(buf + offset);
+    for (uint32_t i = 0; i < 64; i++) {
+      uint32_t F, g;
+      if (i < 16) {
+        F = (B & C) | ((~B) & D);
+        g = i;
+      } else if (i < 32) {
+        F = (D & B) | ((~D) & C);
+        g = (5 * i + 1) % 16;
+      } else if (i < 48) {
+        F = B ^ C ^ D;
+        g = (3 * i + 5) % 16;
+      } else {
+        F = C ^ (B | (~D));
+        g = (7 * i) % 16;
+      }
+      uint32_t T = D;
+      D = C;
+      C = B;
+      B = B + LEFTROTATE((A + F + partsOfSines[i] + W[g]), shiftAmounts[i]);
+      A = T;
+    }
+    state[0] += A;
+    state[1] += B;
+    state[2] += C;
+    state[3] += D;
+  }
+#undef LEFTROTATE
+  free(buf);
+
+  // Convert digest to string
+  std::string res;
+  res.reserve(16 << 1);
+  const uint8_t *digest = reinterpret_cast<const uint8_t *>(state);
+  char hex[3];
+  for (size_t i = 0; i < 16; i++) {
+    snprintf(hex, sizeof(hex), "%02x", digest[i]);
+    res.append(hex);
+  }
+  return res;
+}
+
+}  // namespace lite
+}  // namespace paddle
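lite/utils/md5.h adds a small header-only MD5 routine that returns the digest as a 32-character lowercase hex string. A minimal usage sketch; the C headers are included explicitly here because md5.h itself only pulls in <string> while its body relies on uint32_t, calloc, memcpy, and snprintf:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include "lite/utils/md5.h"

int main() {
  // The digest is a deterministic 32-hex-character string for a given input.
  std::string digest = paddle::lite::MD5("some bytes to fingerprint");
  std::cout << digest.size() << " " << digest << std::endl;  // prints 32 and the digest
  return 0;
}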
diff --git a/lite/utils/paddle_enforce.h b/lite/utils/paddle_enforce.h
deleted file mode 100644
index 82534af996919ac69a8624e442f1af6a9abb2c07..0000000000000000000000000000000000000000
--- a/lite/utils/paddle_enforce.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file defines PADDLE_ENFORCE_xx, which helps to adapt the legacy fluid
- * codes.
- */
-#pragma once
-#include "lite/utils/cp_logging.h"
-#include "lite/utils/string.h"
-
-#define PADDLE_ENFORCE(cond, ...) \
-  CHECK((cond)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_EQ(a, b, ...) \
-  CHECK_EQ((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_LE(a, b, ...) \
-  CHECK_LE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_LT(a, b, ...) \
-  CHECK_LT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-
-#define PADDLE_ENFORCE_GE(a, b, ...) \
-  CHECK_GE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_GT(a, b, ...) \
-  CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-
-#ifndef PADDLE_THROW
-#define PADDLE_THROW(...) printf("" __VA_ARGS__);
-#endif
diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc
index 081006be6711d5d26c405181fd6d86e89c9e4e95..8e14e4d6d5dbab8dc01b9f8a07910a905cae6abf 100644
--- a/lite/utils/replace_stl/stream.cc
+++ b/lite/utils/replace_stl/stream.cc
@@ -23,6 +23,14 @@ namespace paddle {
 namespace lite {
 namespace replace_stl {
 
+#ifndef LITE_WITH_LOG
+#define ADD_DATA_AS_STRING(data_, obj_)
+#else
+#define ADD_DATA_AS_STRING(data_, obj_)    \
+  std::string text = std::to_string(obj_); \
+  pad(text);                               \
+  data_ = data_ + text;
+
 void ostream::pad(const std::string& text) {
   if (display_width_ > 0) {
     if (display_width_ < text.size()) {
@@ -36,15 +44,6 @@ void ostream::pad(const std::string& text) {
     }
   }
 }
-
-#ifndef LITE_WITH_LOG
-#define ADD_DATA_AS_STRING(data_, obj_)
-#else
-#define ADD_DATA_AS_STRING(data_, obj_)    \
-  std::string text = std::to_string(obj_); \
-  pad(text);                               \
-  data_ = data_ + text;
-
 #endif
 
 template <>
diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h
index 3288a1986906b3fd600b91b6a56ae7134644456f..c58265a0cd864ebe2d2d158d953b17e2c230531f 100644
--- a/lite/utils/replace_stl/stream.h
+++ b/lite/utils/replace_stl/stream.h
@@ -57,7 +57,9 @@ class ostream {
   ostream& operator<<(const T* obj);
 
  private:
+#ifdef LITE_WITH_LOG
   void pad(const std::string& text);
+#endif
   std::string data_;
   int display_width_{-1};  // -1 refers to no setting
 };
diff --git a/lite/utils/string.h b/lite/utils/string.h
index ada51d0b85d7536bfc937a7b1b8368a0f0e053be..b1aaf5d6c56d8931c4ad416f9d38c947abc68dd8 100644
--- a/lite/utils/string.h
+++ b/lite/utils/string.h
@@ -60,6 +60,38 @@ static std::string to_string(const T& v) {
   return ss.str();
 }
 
+static std::string to_string(int index) {
+  const int BUFFER_LENGTH = 15;
+  char buffer[BUFFER_LENGTH];
+  snprintf(buffer, sizeof(buffer), "%d", index);
+  return std::string(buffer);
+}
+
+template <class T = std::string>
+static T parse_string(const std::string& v) {
+  return v;
+}
+
+template <>
+int32_t parse_string<int32_t>(const std::string& v) {
+  return std::stoi(v);
+}
+
+template <>
+int64_t parse_string<int64_t>(const std::string& v) {
+  return std::stoll(v);
+}
+
+template <>
+float parse_string<float>(const std::string& v) {
+  return std::stof(v);
+}
+
+template <>
+double parse_string<double>(const std::string& v) {
+  return std::stod(v);
+}
+
 template <typename T>
 std::string Join(const std::vector<T>& vec, const std::string& delim) {
   if (vec.empty()) return "";
@@ -84,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) {
   return "{" + Join(tmp, ",") + "}";
 }
 
-static std::vector<std::string> Split(const std::string& original,
-                                      const std::string& separator) {
-  std::vector<std::string> results;
+template <class T = std::string>
+static std::vector<T> Split(const std::string& original,
+                            const std::string& separator) {
+  std::vector<T> results;
   std::string::size_type pos1, pos2;
   pos2 = original.find(separator);
   pos1 = 0;
   while (std::string::npos != pos2) {
-    results.push_back(original.substr(pos1, pos2 - pos1));
+    results.push_back(parse_string<T>(original.substr(pos1, pos2 - pos1)));
     pos1 = pos2 + separator.size();
    pos2 = original.find(separator, pos1);
   }
   if (pos1 != original.length()) {
-    results.push_back(original.substr(pos1));
+    results.push_back(parse_string<T>(original.substr(pos1)));
   }
   return results;
 }
diff --git a/third-party/flatbuffers b/third-party/flatbuffers
new file mode 160000
index 0000000000000000000000000000000000000000..6df40a2471737b27271bdd9b900ab5f3aec746c7
--- /dev/null
+++ b/third-party/flatbuffers
@@ -0,0 +1 @@
+Subproject commit 6df40a2471737b27271bdd9b900ab5f3aec746c7
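With Split now templated on the element type and parse_string specialized for the common numeric types, delimiter-separated lists can be split and parsed in one step. A minimal sketch, assuming the default template argument preserves the old split-into-strings behavior as reconstructed above:

#include <string>
#include <vector>
#include "lite/utils/string.h"

int main() {
  using paddle::lite::Split;

  // Default behaves as before: split into strings.
  std::vector<std::string> names = Split("conv1,relu1,fc1", ",");

  // New: split and convert to a numeric type in one call.
  std::vector<int64_t> shape = Split<int64_t>("1,3,224,224", ",");  // parsed via std::stoll
  std::vector<float> scales = Split<float>("0.5,0.25", ",");        // parsed via std::stof

  return (names.size() == 3 && shape.size() == 4 && scales.size() == 2) ? 0 : 1;
}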