From c4a87224af1db292c2d3999a3a71238e0a8e27be Mon Sep 17 00:00:00 2001
From: yongqiangma
Date: Tue, 14 Jan 2020 14:49:28 +0800
Subject: [PATCH] Support Bitmain backend,test=develop (#2761)

* Support Bitmain backend
---
 CMakeLists.txt | 4 +
 cmake/bm.cmake | 80 +++++++++
 cmake/configure.cmake | 4 +
 cmake/lite.cmake | 32 +++-
 lite/CMakeLists.txt | 4 +
 lite/api/CMakeLists.txt | 25 ++-
 lite/api/paddle_place.cc | 7 +-
 lite/api/paddle_place.h | 3 +-
 lite/api/test_resnet50_lite_bm.cc | 92 ++++++++++
 lite/backends/CMakeLists.txt | 1 +
 lite/backends/bm/CMakeLists.txt | 5 +
 lite/backends/bm/target_wrapper.cc | 111 ++++++++++++
 lite/backends/bm/target_wrapper.h | 73 ++++++++
 lite/core/CMakeLists.txt | 3 +-
 lite/core/arena/CMakeLists.txt | 2 +-
 lite/core/context.h | 27 +++
 lite/core/memory.cc | 15 ++
 lite/core/memory.h | 9 +
 lite/core/mir/fusion/conv_bn_fuse_pass.cc | 2 +-
 .../mir/fusion/conv_elementwise_fuse_pass.cc | 2 +-
 .../elementwise_add_activation_fuse_pass.cc | 1 +
 lite/core/mir/fusion/fc_fuse_pass.cc | 1 +
 lite/core/mir/memory_optimize_pass.cc | 2 +-
 lite/core/mir/subgraph/subgraph_pass.cc | 16 ++
 lite/core/mir/subgraph/subgraph_pass.h | 5 +
 lite/core/op_registry.cc | 8 +
 lite/core/op_registry.h | 10 ++
 lite/kernels/CMakeLists.txt | 1 +
 lite/kernels/bm/CMakeLists.txt | 6 +
 lite/kernels/bm/bridges/CMakeLists.txt | 31 ++++
 lite/kernels/bm/bridges/act_op.cc | 67 +++++++
 lite/kernels/bm/bridges/batch_norm_op.cc | 109 +++++++++++
 lite/kernels/bm/bridges/conv_op.cc | 106 +++++++++++
 lite/kernels/bm/bridges/elementwise_ops.cc | 135 ++++++++++++++
 lite/kernels/bm/bridges/graph.cc | 35 ++++
 lite/kernels/bm/bridges/graph.h | 48 +++++
 lite/kernels/bm/bridges/mul_op.cc | 95 ++++++++++
 lite/kernels/bm/bridges/paddle_use_bridges.h | 24 +++
 lite/kernels/bm/bridges/pool_op.cc | 101 +++++++++++
 lite/kernels/bm/bridges/scale_op.cc | 80 +++++++++
 lite/kernels/bm/bridges/softmax_op.cc | 78 ++++++++
 lite/kernels/bm/bridges/utility.cc | 60 +++++++
 lite/kernels/bm/bridges/utility.h | 37 ++++
 lite/kernels/bm/subgraph_compute.cc | 169 ++++++++++++++++++
 lite/kernels/bm/subgraph_compute.h | 74 ++++++++
 lite/kernels/npu/bridges/CMakeLists.txt | 2 +-
 lite/tests/kernels/CMakeLists.txt | 64 +++----
 lite/tools/build_bm.sh | 112 ++++++++++++
 48 files changed, 1928 insertions(+), 50 deletions(-)
 create mode 100644 cmake/bm.cmake
 create mode 100644 lite/api/test_resnet50_lite_bm.cc
 create mode 100644 lite/backends/bm/CMakeLists.txt
 create mode 100644 lite/backends/bm/target_wrapper.cc
 create mode 100644 lite/backends/bm/target_wrapper.h
 create mode 100644 lite/kernels/bm/CMakeLists.txt
 create mode 100644 lite/kernels/bm/bridges/CMakeLists.txt
 create mode 100644 lite/kernels/bm/bridges/act_op.cc
 create mode 100644 lite/kernels/bm/bridges/batch_norm_op.cc
 create mode 100644 lite/kernels/bm/bridges/conv_op.cc
 create mode 100644 lite/kernels/bm/bridges/elementwise_ops.cc
 create mode 100644 lite/kernels/bm/bridges/graph.cc
 create mode 100644 lite/kernels/bm/bridges/graph.h
 create mode 100644 lite/kernels/bm/bridges/mul_op.cc
 create mode 100644 lite/kernels/bm/bridges/paddle_use_bridges.h
 create mode 100644 lite/kernels/bm/bridges/pool_op.cc
 create mode 100644 lite/kernels/bm/bridges/scale_op.cc
 create mode 100644 lite/kernels/bm/bridges/softmax_op.cc
 create mode 100644 lite/kernels/bm/bridges/utility.cc
 create mode 100644 lite/kernels/bm/bridges/utility.h
 create mode 100644 lite/kernels/bm/subgraph_compute.cc
 create mode 100644 lite/kernels/bm/subgraph_compute.h
 create mode 100755 lite/tools/build_bm.sh

diff --git a/CMakeLists.txt
b/CMakeLists.txt index f1034e0b95..8c636780af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) +lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) @@ -192,6 +193,9 @@ if(LITE_WITH_CUDA) include(cuda) endif() +if(LITE_WITH_BM) + include(bm) +endif() include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs diff --git a/cmake/bm.cmake b/cmake/bm.cmake new file mode 100644 index 0000000000..3a3abb5966 --- /dev/null +++ b/cmake/bm.cmake @@ -0,0 +1,80 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_BM) + return() +endif() + +if(NOT DEFINED BM_SDK_ROOT) + set(BM_SDK_ROOT $ENV{BM_SDK_ROOT}) + if(NOT BM_SDK_ROOT) + message(FATAL_ERROR "Must set BM_SDK_ROOT or env BM_SDK_ROOT when LITE_WITH_BM=ON") + endif() +endif() + +message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}") +find_path(BM_SDK_INC NAMES bmruntime_interface.h + PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH) +if(NOT BM_SDK_INC) + message(FATAL_ERROR "Can not find bmruntime_interface.h in ${BM_SDK_ROOT}/include") +endif() + +include_directories("${BM_SDK_ROOT}/include/bmruntime") +include_directories("${BM_SDK_ROOT}/include/bmlib") +include_directories("${BM_SDK_ROOT}/include/bmcompiler") +include_directories("${BM_SDK_ROOT}/include/bmcpu") +include_directories("${BM_SDK_ROOT}/include/bmlog") + +find_library(BM_SDK_RT_LIB NAMES bmrt + PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie) +if(NOT BM_SDK_RT_LIB) + message(FATAL_ERROR "Can not find bmrt Library in ${BM_SDK_ROOT}") +else() + message(STATUS "Found bmrt Library: ${BM_SDK_RT_LIB}") + add_library(bmrt SHARED IMPORTED GLOBAL) + set_property(TARGET bmrt PROPERTY IMPORTED_LOCATION ${BM_SDK_RT_LIB}) +endif() + +find_library(BM_SDK_BM_LIB NAMES bmlib + PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie) +if(NOT BM_SDK_BM_LIB) + message(FATAL_ERROR "Can not find bmlib Library in ${BM_SDK_ROOT}") +else() + message(STATUS "Found bmlib Library: ${BM_SDK_BM_LIB}") + add_library(bmlib SHARED IMPORTED GLOBAL) + set_property(TARGET bmlib PROPERTY IMPORTED_LOCATION ${BM_SDK_BM_LIB}) +endif() + +find_library(BM_SDK_COMPILER_LIB NAMES bmcompiler + PATHS ${BM_SDK_ROOT}/lib/bmcompiler) +if(NOT BM_SDK_COMPILER_LIB) + message(FATAL_ERROR "Can not find bmcompiler Library in ${BM_SDK_ROOT}") +else() + message(STATUS "Found bmcompiler Library: ${BM_SDK_COMPILER_LIB}") + add_library(bmcompiler SHARED IMPORTED GLOBAL) + set_property(TARGET bmcompiler PROPERTY IMPORTED_LOCATION ${BM_SDK_COMPILER_LIB}) +endif() + +find_library(BM_SDK_CPU_LIB NAMES 
bmcpu + PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie) +if(NOT BM_SDK_CPU_LIB) + message(FATAL_ERROR "Can not find bmcpu Library in ${BM_SDK_ROOT}") +else() + message(STATUS "Found bmcpu Library: ${BM_SDK_CPU_LIB}") + add_library(bmcpu SHARED IMPORTED GLOBAL) + set_property(TARGET bmcpu PROPERTY IMPORTED_LOCATION ${BM_SDK_CPU_LIB}) +endif() + +set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs") +set(bm_builder_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm builder libs") diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bc055d3186..752b22461d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -143,6 +143,10 @@ if (LITE_WITH_FPGA) add_definitions("-DLITE_WITH_FPGA") endif() +if (LITE_WITH_BM) +add_definitions("-DLITE_WITH_BM") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") if (LITE_WITH_PRECISION_PROFILE) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d01e2d67ed..fd40fa437b 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -94,6 +94,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_BM) + foreach(var ${lite_deps_BM_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -119,7 +125,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -129,6 +135,7 @@ function(lite_cc_library TARGET) X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} + BM_DEPS ${args_BM_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} @@ -163,7 +170,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -177,6 +184,7 @@ function(lite_cc_binary TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -210,7 +218,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS 
FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -232,6 +240,7 @@ function(lite_cc_test TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -260,6 +269,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") +set(bm_kernels CACHE INTERNAL "bm kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -270,12 +280,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA) +# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -341,6 +351,12 @@ function(add_kernel TARGET device level) endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "BM") + if (NOT LITE_WITH_BM) + return() + endif() + set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) return() @@ -374,6 +390,7 @@ function(add_kernel TARGET device level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -392,7 +409,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -424,6 +441,7 @@ function(add_operator TARGET level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 2264e57b9d..cb6a872e06 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -66,6 +67,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) + if (LITE_WITH_BM) + 
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") + endif(LITE_WITH_BM) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index d91fe9cd50..f0ac9c9e3b 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -61,13 +61,19 @@ if (WITH_TESTING) ${ops} ${host_kernels} CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} - XPU_DEPS ${xpu_kernels}) + XPU_DEPS ${xpu_kernels} + BM_DEPS ${bm_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) endif() +if(LITE_WITH_BM) + set(light_api_deps ${light_api_deps} ${bm_deps}) + set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) +endif() + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -76,6 +82,7 @@ message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") +message(STATUS "get BM kernels ${bm_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -90,6 +97,7 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) endif() @@ -111,7 +119,8 @@ lite_cc_library(light_api SRCS light_api.cc NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -129,6 +138,7 @@ if(WITH_TESTING) XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -164,6 +174,12 @@ if(WITH_TESTING) ${ops} ${host_kernels} ${x86_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + if(LITE_WITH_BM) + lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + endif() endif() endif() @@ -254,6 +270,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) lite_cc_test(test_apis SRCS apis_test.cc @@ -262,6 +279,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -293,6 +311,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -307,6 +326,7 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS 
${opencl_kernels}
+          BM_DEPS ${bm_kernels}
           FPGA_DEPS ${fpga_kernels}
           X86_DEPS ${x86_kernels}
           CUDA_DEPS ${cuda_kernels})
@@ -328,6 +348,7 @@ if(NOT IOS)
           NPU_DEPS ${npu_kernels}
           XPU_DEPS ${xpu_kernels}
           CL_DEPS ${opencl_kernels}
+          BM_DEPS ${bm_kernels}
           FPGA_DEPS ${fpga_kernels}
           X86_DEPS ${x86_kernels}
           CUDA_DEPS ${cuda_kernels})
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index 6d12df67ac..2cced919e6 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -55,7 +55,8 @@ const std::string& TargetToStr(TargetType target) {
                                              "any",
                                              "fpga",
                                              "npu",
-                                             "xpu"};
+                                             "xpu",
+                                             "bm"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -94,7 +95,8 @@ const std::string& TargetRepr(TargetType target) {
                                              "kAny",
                                              "kFPGA",
                                              "kNPU",
-                                             "kXPU"};
+                                             "kXPU",
+                                             "kBM"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -135,6 +137,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                        TARGET(kOpenCL),
                                        TARGET(kNPU),
                                        TARGET(kXPU),
+                                       TARGET(kBM),
                                        TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index 1aa4152235..c8f136ace8 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -52,8 +52,9 @@ enum class TargetType : int {
   kFPGA = 7,
   kNPU = 8,
   kXPU = 9,
+  kBM = 10,
   kAny = 6,  // any target
-  NUM = 10,  // number of fields.
+  NUM = 11,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
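A note on the two string tables above: TargetToStr and TargetRepr index their arrays with the numeric enum value, so appending "bm"/"kBM" only works because kBM takes slot 10 while kAny keeps slot 6 and NUM moves to 11. A hedged sanity check, not part of the patch, assuming the public lite_api namespace:

#include <cassert>
#include "lite/api/paddle_place.h"

int main() {
  using namespace paddle::lite_api;  // NOLINT
  // Each table entry must line up with the enum's numeric value.
  assert(TargetToStr(TargetType::kBM) == "bm");    // index 10
  assert(TargetRepr(TargetType::kBM) == "kBM");
  assert(TargetToStr(TargetType::kAny) == "any");  // index 6, unchanged
  return 0;
}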
diff --git a/lite/api/test_resnet50_lite_bm.cc b/lite/api/test_resnet50_lite_bm.cc
new file mode 100644
index 0000000000..62a58704f4
--- /dev/null
+++ b/lite/api/test_resnet50_lite_bm.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(input_img_txt_path,
+              "",
+              "if set input_img_txt_path, read the img filename as input.");
+
+namespace paddle {
+namespace lite {
+
+void TestModel(const std::vector<Place>& valid_places) {
+  lite::Predictor predictor;
+  std::vector<std::string> passes;
+  passes.push_back("bm_subgraph_pass");
+  predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  auto item_size = input_tensor->dims().production();
+  if (FLAGS_input_img_txt_path.empty()) {
+    for (int i = 0; i < item_size; i++) {
+      data[i] = 1;
+    }
+  } else {
+    std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
+    if (!fs.is_open()) {
+      LOG(FATAL) << "open input_img_txt error.";
+    }
+    for (int i = 0; i < item_size; i++) {
+      fs >> data[i];
+    }
+  }
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  auto* out = predictor.GetOutput(0);
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 1);
+  ASSERT_EQ(out->dims()[1], 1000);
+
+  auto* out_data = out->data<float>();
+  FILE* fp = fopen("result.txt", "wb");
+  for (int i = 0; i < out->numel(); i++) {
+    fprintf(fp, "%f\n", out_data[i]);
+  }
+  fclose(fp);
+}
+
+TEST(ResNet50, test_bm) {
+  std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
+
+  TestModel(valid_places);
+}
+
+}  // namespace lite
+}  // namespace paddle
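The test drives the heavyweight lite::Predictor directly so it can inject bm_subgraph_pass into the pass list. For orientation only, a sketch of the same flow through the public API, assuming the CxxConfig/CreatePaddlePredictor surface of this Paddle-Lite generation; the function name here is mine, not the patch's:

#include <string>
#include "lite/api/paddle_api.h"

void RunResNet50OnBM(const std::string& model_dir) {
  using namespace paddle::lite_api;  // NOLINT
  CxxConfig config;
  config.set_model_dir(model_dir);
  // kBM first so supported subgraphs land on the device; kX86 as fallback.
  config.set_valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
                           Place{TARGET(kX86), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.f;
  predictor->Run();
}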
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index dec63e6efa..e351746481 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -6,3 +6,4 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(bm)
diff --git a/lite/backends/bm/CMakeLists.txt b/lite/backends/bm/CMakeLists.txt
new file mode 100644
index 0000000000..9e15b9836b
--- /dev/null
+++ b/lite/backends/bm/CMakeLists.txt
@@ -0,0 +1,5 @@
+if (NOT LITE_WITH_BM)
+  return()
+endif()
+
+lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
diff --git a/lite/backends/bm/target_wrapper.cc b/lite/backends/bm/target_wrapper.cc
new file mode 100644
index 0000000000..c75c714522
--- /dev/null
+++ b/lite/backends/bm/target_wrapper.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/backends/bm/target_wrapper.h"
+#include <bmlib_runtime.h>
+#include <map>
+#include <utility>
+
+namespace paddle {
+namespace lite {
+
+int TargetWrapperBM::device_id_ = 0;
+std::map<int, void*> TargetWrapperBM::bm_hds_;
+
+size_t TargetWrapperBM::num_devices() {
+  int count = 0;
+  bm_dev_getcount(&count);
+  return count;
+}
+
+void TargetWrapperBM::SetDevice(int id) {
+  /*
+  if (id < 0 || (size_t)id >= num_devices()) {
+    LOG(FATAL) << "Failed with invalid device id " << id;
+  }
+  */
+  device_id_ = id;
+  if (bm_hds_.find(id) == bm_hds_.end()) {
+    bm_handle_t bm_handle;
+    bm_status_t ret = bm_dev_request(&bm_handle, id);
+    CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
+                              << static_cast<int>(ret);
+    bm_hds_.insert(std::pair<int, void*>(id, bm_handle));
+  }
+  return;
+}
+
+void* TargetWrapperBM::GetHandle() {
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    LOG(FATAL) << "device not initialized " << device_id_;
+  }
+  return bm_hds_.at(device_id_);
+}
+
+void* TargetWrapperBM::Malloc(size_t size) {
+  void* ptr{};
+
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    SetDevice(device_id_);
+  }
+
+  bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+  bm_device_mem_t* p_mem =
+      reinterpret_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
+  bm_malloc_device_byte(bm_handle, p_mem, size);
+  ptr = reinterpret_cast<void*>(p_mem);
+  return ptr;
+}
+
+void TargetWrapperBM::Free(void* ptr) {
+  if (ptr != NULL) {
+    bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+    bm_device_mem_t* mem = static_cast<bm_device_mem_t*>(ptr);
+    bm_free_device(bm_handle, *mem);
+    free(ptr);
+  }
+  return;
+}
+
+void TargetWrapperBM::MemcpySync(void* dst,
+                                 const void* src,
+                                 size_t size,
+                                 IoDirection dir) {
+  if (bm_hds_.find(device_id_) == bm_hds_.end()) {
+    return;
+  }
+
+  bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
+  bm_device_mem_t* pmem{};
+  const bm_device_mem_t* pcst_mem{};
+
+  switch (dir) {
+    case IoDirection::HtoD:
+      pmem = static_cast<bm_device_mem_t*>(dst);
+      bm_memcpy_s2d_partial_offset(
+          bm_handle, *pmem, const_cast<void*>(src), size, 0);
+      break;
+    case IoDirection::DtoH:
+      pcst_mem = static_cast<const bm_device_mem_t*>(src);
+      bm_memcpy_d2s_partial_offset(
+          bm_handle, reinterpret_cast<void*>(dst), *pcst_mem, size, 0);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
+      break;
+  }
+  return;
+}
+
+}  // namespace lite
+}  // namespace paddle
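Worth flagging for users of this wrapper: Malloc does not hand back a raw device address. It heap-allocates a bm_device_mem_t descriptor and returns that, so the pointer is only meaningful to the other TargetWrapperBM calls and must never be dereferenced directly. A minimal round-trip sketch (my example, not from the patch):

#include <vector>
#include "lite/backends/bm/target_wrapper.h"

void RoundTrip(const std::vector<float>& src, std::vector<float>* dst) {
  using paddle::lite::IoDirection;
  using paddle::lite::TargetWrapperBM;
  const size_t bytes = src.size() * sizeof(float);
  dst->resize(src.size());
  void* dev = TargetWrapperBM::Malloc(bytes);  // really a bm_device_mem_t*
  TargetWrapperBM::MemcpySync(dev, src.data(), bytes, IoDirection::HtoD);
  TargetWrapperBM::MemcpySync(dst->data(), dev, bytes, IoDirection::DtoH);
  TargetWrapperBM::Free(dev);  // releases device memory and the descriptor
}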
diff --git a/lite/backends/bm/target_wrapper.h b/lite/backends/bm/target_wrapper.h
new file mode 100644
index 0000000000..2674ffe161
--- /dev/null
+++ b/lite/backends/bm/target_wrapper.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+
+using TargetWrapperBM = TargetWrapper<TARGET(kBM)>;
+
+template <>
+class TargetWrapper<TARGET(kBM)> {
+ public:
+  using stream_t = int;
+  using event_t = int;
+
+  static size_t num_devices();
+  static size_t maximum_stream() { return 0; }
+
+  static void SetDevice(int id);
+  static void CreateStream(stream_t* stream) {}
+  static void DestroyStream(const stream_t& stream) {}
+
+  static void CreateEvent(event_t* event) {}
+  static void DestroyEvent(const event_t& event) {}
+
+  static void RecordEvent(const event_t& event) {}
+  static void SyncEvent(const event_t& event) {}
+
+  static void StreamSync(const stream_t& stream) {}
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void* GetHandle();
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+
+  static void MemcpyAsync(void* dst,
+                          const void* src,
+                          size_t size,
+                          IoDirection dir,
+                          const stream_t& stream) {}
+
+  static void MemsetSync(void* devPtr, int value, size_t count) {}
+
+  static void MemsetAsync(void* devPtr,
+                          int value,
+                          size_t count,
+                          const stream_t& stream) {}
+
+ private:
+  static int device_id_;
+  static std::map<int, void*> bm_hds_;
+};
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 8fda0a12fd..1d0558451f 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -6,7 +6,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
     X86_DEPS target_wrapper_x86
     CUDA_DEPS target_wrapper_cuda
     CL_DEPS cl_target_wrapper
-    FPGA_DEPS fpga_target_wrapper)
+    FPGA_DEPS fpga_target_wrapper
+    BM_DEPS target_wrapper_bm)
 lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt
index 1c85353d53..0f3f36768b 100644
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/core/context.h b/lite/core/context.h
index 2830bca5c1..653329e4f2 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -55,6 +55,7 @@ using NPUContext = Context<TargetType::kNPU>;
 using XPUContext = Context<TargetType::kXPU>;
 using OpenCLContext = Context<TargetType::kOpenCL>;
 using FPGAContext = Context<TargetType::kFPGA>;
+using BMContext = Context<TargetType::kBM>;
 
 template <>
 class Context<TargetType::kHost> {
@@ -82,6 +83,23 @@
 };
 #endif
 
+#ifdef LITE_WITH_BM
+template <>
+class Context<TargetType::kBM> {
+ public:
+  Context() {}
+  explicit Context(const BMContext& ctx);
+  // NOTE: InitOnce should only be used by ContextScheduler
+  void InitOnce() { Init(0); }
+
+  void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
+  void CopySharedTo(BMContext* ctx) {}
+  void* GetHandle() { return TargetWrapperBM::GetHandle(); }
+
+  std::string name() const { return "BMContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_XPU
 template <>
 class Context<TargetType::kXPU> {
@@ -374,6 +392,12 @@ class ContextScheduler {
       kernel_contexts_[TargetType::kFPGA].As<FPGAContext>().CopySharedTo(
           &ctx->As<FPGAContext>());
       break;
+#endif
+#ifdef LITE_WITH_BM
+    case TARGET(kBM):
+      kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
+          &ctx->As<BMContext>());
+      break;
 #endif
     default:
 #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
@@ -412,6 +436,9 @@
 #endif
 #ifdef LITE_WITH_XPU
     InitContext<TargetType::kXPU, XPUContext>();
+#endif
+#ifdef LITE_WITH_BM
+    InitContext<TargetType::kBM, BMContext>();
 #endif
   }
diff --git a/lite/core/memory.cc b/lite/core/memory.cc
index b3cb18b336..cfb0b3ae17 100644
--- a/lite/core/memory.cc
+++ b/lite/core/memory.cc
@@ -40,6 +40,11 @@ void* TargetMalloc(TargetType target, size_t size) {
       data = TargetWrapper<TARGET(kOpenCL)>::Malloc(size);
       break;
 #endif  // LITE_WITH_OPENCL
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      data = TargetWrapper<TARGET(kBM)>::Malloc(size);
+      break;
+#endif
     default:
       LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
   }
@@ -69,6 +74,11 @@
       TargetWrapper<TARGET(kCUDA)>::Free(data);
       break;
 #endif  // LITE_WITH_CUDA
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      TargetWrapper<TARGET(kBM)>::Free(data);
+      break;
+#endif
     default:
       LOG(FATAL) << "Unknown type";
   }
@@ -95,6 +105,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
           dst, src, size, IoDirection::DtoD);
       break;
 #endif
+#ifdef LITE_WITH_BM
+    case TargetType::kBM:
+      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
+      break;
+#endif
 #ifdef LITE_WITH_OPENCL
     case TargetType::kOpenCL:
       TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
diff --git a/lite/core/memory.h b/lite/core/memory.h
index 001db760a0..051d47bdde 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -25,6 +25,10 @@
 #include "lite/backends/cuda/target_wrapper.h"
 #endif  // LITE_WITH_CUDA
 
+#ifdef LITE_WITH_BM
+#include "lite/backends/bm/target_wrapper.h"
+#endif  // LITE_WITH_BM
+
 namespace paddle {
 namespace lite {
 
@@ -71,6 +75,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
     case TARGET(kFPGA):
       TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
       break;
+#endif
+#ifdef LITE_WITH_BM
+    case TARGET(kBM):
+      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
+      break;
 #endif
   }
 }
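With the context and memory hooks above in place, generic code reaches BM memory through the usual helpers; note that TargetCopy only wires up device-to-device for kBM, so host transfers go through CopySync with an explicit direction. A hedged sketch (my example, assuming CopySync keeps its templated signature):

#include "lite/core/memory.h"

void StageInput(const float* host, size_t n) {
  using namespace paddle::lite;  // NOLINT
  void* dev = TargetMalloc(TARGET(kBM), n * sizeof(float));
  CopySync<TARGET(kBM)>(dev, host, n * sizeof(float), IoDirection::HtoD);
  // ... run the subgraph, then copy results back with IoDirection::DtoH ...
  TargetFree(TARGET(kBM), dev);
}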
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
index 4725ca7485..4fa79d226d 100644
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
@@ -46,4 +46,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 
 REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
     .BindTargets({TARGET(kAny)})
-    .ExcludeTargets({TARGET(kX86), TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kX86), TARGET(kXPU), TARGET(kBM)});
diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc
index b1b492ce03..2021bdd348 100644
--- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc
@@ -47,4 +47,4 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass,
                   paddle::lite::mir::ConvElementwiseFusePass)
     .BindTargets({TARGET(kAny)})
-    .ExcludeTargets({TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kXPU), TARGET(kBM)});
diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
index ffa045f7de..1c2297710b 100644
--- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
@@ -36,5 +36,6 @@
 REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass,
                   paddle::lite::mir::ElementwiseAddActivationFusePass)
     .BindTargets({TARGET(kAny)})
     .ExcludeTargets({TARGET(kXPU)})
+    .ExcludeTargets({TARGET(kBM)})
     .ExcludeTargets({TARGET(kX86)})
     .BindKernel("fusion_elementwise_add_activation");
diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc
index 041d896790..46695be396 100644
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -39,5 +39,6 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
     .BindTargets({TARGET(kAny)})
     .ExcludeTargets({TARGET(kXPU)})
+    .ExcludeTargets({TARGET(kBM)})
     .ExcludeTargets({TARGET(kCUDA)})
     .BindKernel("fc");
diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc
index dbf32da234..6256a49a99 100644
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
@@ -256,4 +256,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 
 REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
     .BindTargets({TARGET(kARM)})
-    .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)});
+    .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc
index 1f83139791..5e2cecd277 100644
--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -53,6 +53,20 @@ void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   fuser();
 }
 
+void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
@@ -61,3 +75,5 @@
 REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
     .BindTargets({TARGET(kNPU)});
 REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
     .BindTargets({TARGET(kXPU)});
+REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
+    .BindTargets({TARGET(kBM)});
diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h
index 554f54304a..1ba0f2ab4a 100644
--- a/lite/core/mir/subgraph/subgraph_pass.h
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -32,6 +32,11 @@ class XPUSubgraphPass : public ProgramPass {
   void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 };
 
+class BMSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
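The #define/#include/#undef sequence in BMSubgraphPass::Apply is the whole op-support story: each USE_SUBGRAPH_BRIDGE line in paddle_use_bridges.h is re-expanded into an insert call, roughly:

// Illustrative expansion of the macro trick above:
std::unordered_set<std::string> supported_lists;
supported_lists.insert("relu");    // USE_SUBGRAPH_BRIDGE(relu, kBM);
supported_lists.insert("conv2d");  // USE_SUBGRAPH_BRIDGE(conv2d, kBM);
// ... one insert per bridge listed in paddle_use_bridges.h

Any op not in the set keeps the teller false, so it stays on the fallback target instead of being fused into a BM subgraph.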
diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc
index 716ce9d6a8..b49670eefb 100644
--- a/lite/core/op_registry.cc
+++ b/lite/core/op_registry.cc
@@ -100,6 +100,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
     case TARGET(kFPGA): {
       CREATE_KERNEL(kFPGA);
     } break;
+    case TARGET(kBM): {
+      CREATE_KERNEL(kBM);
+    } break;
     default:
       CHECK(false) << "not supported kernel target " << TargetToStr(target);
   }
@@ -186,6 +189,11 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kFPGA, kFloat, kNHWC);
   INIT_FOR(kFPGA, kAny, kNHWC);
   INIT_FOR(kFPGA, kAny, kAny);
+
+  INIT_FOR(kBM, kFloat, kNCHW);
+  INIT_FOR(kBM, kInt8, kNCHW);
+  INIT_FOR(kBM, kAny, kNCHW);
+  INIT_FOR(kBM, kAny, kAny);
 #undef INIT_FOR
 }
diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h
index 0df5cb41ec..a49682eea6 100644
--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -230,6 +230,16 @@ class KernelRegistry final {
                                               PRECISION(kInt8),
                                               DATALAYOUT(kNCHW)> *,  //
+                      KernelRegistryForTarget<TARGET(kBM),
+                                              PRECISION(kFloat),
+                                              DATALAYOUT(kNCHW)> *,  //
+                      KernelRegistryForTarget<TARGET(kBM),
+                                              PRECISION(kInt8),
+                                              DATALAYOUT(kNCHW)> *,  //
+                      KernelRegistryForTarget<TARGET(kBM),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kNCHW)> *,  //
+                      KernelRegistryForTarget<TARGET(kBM),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kAny)> *,  //
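Each INIT_FOR call pre-creates an empty kernel bucket for one (target, precision, layout) combination, and the four kBM entries added to the variant list above match them one-for-one. A hedged lookup sketch, assuming KernelRegistry::Create keeps the signature visible in op_registry.cc:

// How a BM kernel would be fetched from the registry at runtime (sketch):
auto kernels = paddle::lite::KernelRegistry::Global().Create(
    "subgraph", TARGET(kBM), PRECISION(kFloat), DATALAYOUT(kNCHW));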
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index 0bfd39ae9a..4e0092b392 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(opencl)
 add_subdirectory(fpga)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(bm)
diff --git a/lite/kernels/bm/CMakeLists.txt b/lite/kernels/bm/CMakeLists.txt
new file mode 100644
index 0000000000..691fe55978
--- /dev/null
+++ b/lite/kernels/bm/CMakeLists.txt
@@ -0,0 +1,6 @@
+if(NOT LITE_WITH_BM)
+  return ()
+endif()
+
+add_subdirectory(bridges)
+add_kernel(subgraph_compute_bm BM basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${bm_subgraph_bridges})
diff --git a/lite/kernels/bm/bridges/CMakeLists.txt b/lite/kernels/bm/bridges/CMakeLists.txt
new file mode 100644
index 0000000000..f9d1f8feea
--- /dev/null
+++ b/lite/kernels/bm/bridges/CMakeLists.txt
@@ -0,0 +1,31 @@
+if(NOT LITE_WITH_BM)
+  return()
+endif()
+
+lite_cc_library(subgraph_bridge_utility_bm SRCS utility.cc DEPS)
+lite_cc_library(subgraph_bridge_graph_bm SRCS graph.cc DEPS subgraph_bridge_utility_bm)
+
+set(bm_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_bm subgraph_bridge_graph_bm)
+
+lite_cc_library(subgraph_bridge_act_op_bm SRCS act_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_bm SRCS conv_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_elementwise_ops_bm SRCS elementwise_ops.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_bm SRCS pool_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_softmax_op_bm SRCS softmax_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_mul_op_bm SRCS mul_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_batch_norm_op_bm SRCS batch_norm_op.cc DEPS ${bm_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_scale_op_bm SRCS scale_op.cc DEPS ${bm_subgraph_bridge_deps})
+
+set(bm_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_engine
+    subgraph_bridge_graph_bm
+    subgraph_bridge_act_op_bm
+    subgraph_bridge_conv_op_bm
+    subgraph_bridge_elementwise_ops_bm
+    subgraph_bridge_pool_op_bm
+    subgraph_bridge_softmax_op_bm
+    subgraph_bridge_mul_op_bm
+    subgraph_bridge_batch_norm_op_bm
+    subgraph_bridge_scale_op_bm
+    CACHE INTERNAL "bm_subgraph_bridges")
diff --git a/lite/kernels/bm/bridges/act_op.cc b/lite/kernels/bm/bridges/act_op.cc
new file mode 100644
index 0000000000..905bce2506
--- /dev/null
+++ b/lite/kernels/bm/bridges/act_op.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
+  }
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+  }
+  CHECK_EQ(op_type, "relu");
+  add_relu_layer(graph->GetCompilerHandle(),
+                 const_cast<const int*>(&i_x_shape_data[0]),
+                 x_dims.size(),
+                 static_cast<const char*>(x_var_name.c_str()),
+                 const_cast<const int*>(&i_output_shape_data[0]),
+                 output_dims.size(),
+                 static_cast<const char*>(output_var_name.c_str()),
+                 0.f,
+                 -1.f);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(relu, kBM, paddle::lite::subgraph::bm::ActConverter);
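Every converter repeats the int64-to-int32 shape massaging seen in ActConverter, because the BM compiler API takes int dimensions while DDim stores int64_t. A shared helper — hypothetical, not part of this patch — would collapse that boilerplate:

#include <vector>
#include "lite/core/tensor.h"

// Hypothetical helper: DDim (int64_t) -> BM-compiler shape (int32_t).
static std::vector<int32_t> ToInt32Shape(const paddle::lite::DDim& dims) {
  std::vector<int32_t> shape(dims.size());
  for (size_t i = 0; i < dims.size(); ++i) {
    shape[i] = static_cast<int32_t>(dims[i]);
  }
  return shape;
}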
diff --git a/lite/kernels/bm/bridges/batch_norm_op.cc b/lite/kernels/bm/bridges/batch_norm_op.cc
new file mode 100644
index 0000000000..fbf70178fd
--- /dev/null
+++ b/lite/kernels/bm/bridges/batch_norm_op.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
+  }
+  int channel_size = x_dims[1];
+  auto scale_var_name = op_info->Input("Scale").front();
+  auto scale = scope->FindVar(scale_var_name)->GetMutable<lite::Tensor>();
+  auto bias_var_name = op_info->Input("Bias").front();
+  auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
+  auto mean_var_name = op_info->Input("Mean").front();
+  auto mean = scope->FindVar(mean_var_name)->GetMutable<lite::Tensor>();
+  auto variance_var_name = op_info->Input("Variance").front();
+  auto variance = scope->FindVar(variance_var_name)->GetMutable<lite::Tensor>();
+  // output
+  auto output_var_name = op_info->Output("Y").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+  }
+  auto epsilon = op_info->GetAttr<float>("epsilon");
+  auto unique_bn_out_name = lite::subgraph::bm::UniqueName("batch_norm_out");
+  auto* scale_data = scale->mutable_data<float>();
+  auto* bias_data = bias->mutable_data<float>();
+  auto* mean_data = mean->mutable_data<float>();
+  auto* variance_data = variance->mutable_data<float>();
+  for (int c = 0; c < channel_size; c++) {
+    float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon));
+    bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
+    scale_data[c] = inv_scale * scale_data[c];
+  }
+
+  const int input_num = 1;
+  int** shape = new int*[input_num];
+  int* dim = new int[input_num];
+  const char** name = new const char*[input_num];
+  name[0] = static_cast<const char*>(x_var_name.c_str());
+  dim[0] = x_dims.size();
+  shape[0] = &i_x_shape_data[0];
+  add_scale_layer(graph->GetCompilerHandle(),
+                  input_num,
+                  shape,
+                  dim,
+                  name,
+                  const_cast<const int*>(&i_output_shape_data[0]),
+                  output_dims.size(),
+                  static_cast<const char*>(output_var_name.c_str()),
+                  static_cast<const char*>(unique_op_name.c_str()),
+                  static_cast<const float*>(scale->mutable_data<float>()),
+                  static_cast<const float*>(bias->mutable_data<float>()),
+                  1,
+                  1,
+                  1);
+  delete[] shape;
+  delete[] name;
+  delete[] dim;
+
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(batch_norm,
+                         kBM,
+                         paddle::lite::subgraph::bm::BatchNormConverter);
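What the channel loop above does, in equations: inference-time batch norm y = (x - mean) / sqrt(variance + eps) * scale + bias is folded into the single scale layer the converter emits. With inv = 1 / sqrt(variance[c] + eps):

    scale'[c] = scale[c] * inv
    bias'[c]  = bias[c] - scale[c] * inv * mean[c]

so y = scale' * x + bias' channel-wise, which is why the converter can reuse add_scale_layer instead of a dedicated batch-norm primitive. Note that it rewrites the Scale and Bias weight tensors in place.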
diff --git a/lite/kernels/bm/bridges/conv_op.cc b/lite/kernels/bm/bridges/conv_op.cc
new file mode 100644
index 0000000000..ab48ade68f
--- /dev/null
+++ b/lite/kernels/bm/bridges/conv_op.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/conv_op.h"
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  auto input_var_name = op_info->Input("Input").front();
+  auto input = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
+  auto input_dims = input->dims();
+  auto output_var_name = op_info->Output("Output").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  auto filter_var_name = op_info->Input("Filter").front();
+  auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>();
+  auto filter_dims = filter->dims();
+  CHECK_EQ(input_dims.size(), 4);
+  CHECK_EQ(output_dims.size(), 4);
+  CHECK_EQ(filter_dims.size(), 4);
+  bool has_bias = lite::subgraph::bm::HasInputArg(op_info, scope, "Bias");
+  float* bias_data = nullptr;
+  if (has_bias) {
+    auto bias_var_name = op_info->Input("Bias").front();
+    auto* bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
+    bias_data = static_cast<float*>(bias->mutable_data<float>());
+  }
+  const int64_t* input_shape_data =
+      const_cast<const int64_t*>(&input_dims.data()[0]);
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_input_shape_data(input_dims.size());
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+
+  for (size_t i = 0; i < input_dims.size(); i++) {
+    i_input_shape_data[i] = static_cast<int>(input_shape_data[i]);
+  }
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+  }
+  const float* filter_data =
+      const_cast<const float*>(filter->mutable_data<float>());
+  auto groups = op_info->GetAttr<int>("groups");
+  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+
+  add_conv_layer(graph->GetCompilerHandle(),
+                 const_cast<const int*>(&i_input_shape_data[0]),
+                 input_dims.size(),
+                 static_cast<const char*>(input_var_name.c_str()),
+                 const_cast<const int*>(&i_output_shape_data[0]),
+                 output_dims.size(),
+                 static_cast<const char*>(output_var_name.c_str()),
+                 static_cast<const char*>(unique_op_name.c_str()),
+                 filter_data,
+                 bias_data,
+                 filter_dims.data()[2],
+                 filter_dims.data()[3],
+                 groups,
+                 paddings[0],
+                 paddings[0],
+                 paddings[1],
+                 paddings[1],
+                 strides[0],
+                 strides[1],
+                 dilations[0],
+                 dilations[1],
+                 static_cast<int>(has_bias));
  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(conv2d,
+                         kBM,
+                         paddle::lite::subgraph::bm::ConvConverter);
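The converter trusts shape inference: add_conv_layer receives the output shape rather than deriving it, and kh/kw come from filter_dims[2]/filter_dims[3]. The shapes it is handed satisfy the standard relation

    H_out = (H_in + 2 * pad_h - dilation_h * (kh - 1) - 1) / stride_h + 1

(similarly for W). For ResNet-50's stem conv (H_in = 224, kh = 7, stride 2, pad 3, dilation 1): (224 + 6 - 6 - 1) / 2 + 1 = 112. Also note the symmetric-padding assumption: paddings[0] and paddings[1] are each passed for both sides of their axis, so asymmetric padding is not representable here.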
diff --git a/lite/kernels/bm/bridges/elementwise_ops.cc b/lite/kernels/bm/bridges/elementwise_ops.cc
new file mode 100644
index 0000000000..7d158110ee
--- /dev/null
+++ b/lite/kernels/bm/bridges/elementwise_ops.cc
@@ -0,0 +1,135 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_defs.h>
+#include <bmcompiler_if.h>
+#include <bmcompiler_if_lite.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  // input
+  const int input_num = 2;
+  int** shape = new int*[input_num];
+  int* dim = new int[input_num];
+  const char** name = new const char*[input_num];
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  name[0] = static_cast<const char*>(x_var_name.c_str());
+  dim[0] = x_dims.size();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
+  }
+  shape[0] = &i_x_shape_data[0];
+  auto y_var_name = op_info->Input("Y").front();
+  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  auto y_dims = y->dims();
+  name[1] = static_cast<const char*>(y_var_name.c_str());
+  dim[1] = y_dims.size();
+  const int64_t* y_shape_data = const_cast<const int64_t*>(&y_dims.data()[0]);
+  std::vector<int32_t> i_y_shape_data(y_dims.size());
+  for (size_t i = 0; i < y_dims.size(); i++) {
+    i_y_shape_data[i] = static_cast<int>(y_shape_data[i]);
+  }
+  shape[1] = &i_y_shape_data[0];
+  bool y_is_const = !graph->HasNode(y_var_name);
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
+  }
+  if (y_is_const) {
+    CHECK_EQ(op_type, "elementwise_add");
+  }
+  int op_code{-1};
+  float coeff[2] = {1.f, 1.f};
+  if (op_type == "elementwise_mul") {
+    op_code = 0;
+  } else if (op_type == "elementwise_add") {
+    op_code = 1;
+  } else if (op_type == "elementwise_sub") {
+    op_code = 1;
+    coeff[1] = -1.f;
+  } else {
+    LOG(FATAL) << "UNSUPPORTED ELTWISE OPERATION: " << op_type;
+  }
+  if (!y_is_const) {
+    add_eltwise_layer(graph->GetCompilerHandle(),
+                      input_num,
+                      shape,
+                      dim,
+                      name,
+                      const_cast<const int*>(&i_output_shape_data[0]),
+                      output_dims.size(),
+                      static_cast<const char*>(output_var_name.c_str()),
+                      op_code,
+                      coeff);
+  } else {
+    const float* y_data = const_cast<const float*>(y->mutable_data<float>());
+    const float* x_data = const_cast<const float*>(x->mutable_data<float>());
+    bm_add_const_tensor(graph->GetCompilerHandle(),
+                        name[1],
+                        shape[0],
+                        dim[0],
+                        static_cast<bm_data_type_t>(DTYPE_FP32),
+                        static_cast<const void*>(y_data));
+
+    add_binary_layer_v2(graph->GetCompilerHandle(),
+                        name[0],
+                        shape[0],
+                        dim[0],
+                        0,
+                        static_cast<const float*>(x_data),
+                        name[1],
+                        shape[0],
+                        dim[0],
+                        0,
+                        static_cast<const float*>(y_data),
+                        static_cast<const char*>(output_var_name.c_str()),
+                        0);
+  }
+  delete[] shape;
+  delete[] name;
+  delete[] dim;
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
+                         kBM,
+                         paddle::lite::subgraph::bm::ElementwiseConverter);
diff --git a/lite/kernels/bm/bridges/graph.cc b/lite/kernels/bm/bridges/graph.cc
new file mode 100644
index 0000000000..81dedb30c6
--- /dev/null
+++ b/lite/kernels/bm/bridges/graph.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/bm/bridges/graph.h"
+#include <bmcompiler_if.h>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+void Graph::AddNode(const std::string& name) {
+  nodes_.insert(std::make_pair(name, name));
+}
+
+void Graph::CreateCompilerHandle() {
+  compiler_handle_ = create_bmcompiler("BM1684");
+  CHECK(compiler_handle_ != nullptr);
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
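Graph only records the names of tensors produced by already-converted ops, so HasNode doubles as a constness test in ElementwiseConverter:

// From the converter above: an input never emitted by an upstream bridge
// must be a weight, so elementwise_add takes the const-tensor path.
bool y_is_const = !graph->HasNode(y_var_name);

That is also why only elementwise_add is accepted with a constant Y: the const path is lowered through bm_add_const_tensor plus a binary layer rather than add_eltwise_layer.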
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+// Graph to collect all of the converted BM IR nodes
+class Graph {
+ public:
+  void AddNode(const std::string& name);
+  bool HasNode(const std::string& name) {
+    return nodes_.find(name) != nodes_.end();
+  }
+  void CreateCompilerHandle();
+  void* GetCompilerHandle() { return compiler_handle_; }
+
+ private:
+  std::unordered_map<std::string, std::string> nodes_;
+  void* compiler_handle_;
+};
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/bm/bridges/mul_op.cc b/lite/kernels/bm/bridges/mul_op.cc
new file mode 100644
index 0000000000..add4c89d2b
--- /dev/null
+++ b/lite/kernels/bm/bridges/mul_op.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // only a constant Y is supported
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int32_t>(x_shape_data[i]);
+  }
+  // add reshape layer
+  int i_x_reshape_shape_data[2];
+  for (size_t i = 0; i < 2; i++) {
+    i_x_reshape_shape_data[i] = static_cast<int32_t>(x_shape_data[i]);
+  }
+  int reshape_param[] = {0, -1};
+  auto unique_op_reshape_name =
+      lite::subgraph::bm::UniqueName(op_type + "_reshape");
+  add_reshape_layer(graph->GetCompilerHandle(),
+                    const_cast<const int*>(&i_x_shape_data[0]),
+                    x_dims.size(),
+                    static_cast<const char*>(x_var_name.c_str()),
+                    const_cast<const int*>(&i_x_reshape_shape_data[0]),
+                    2,
+                    static_cast<const char*>(unique_op_reshape_name.c_str()),
+                    const_cast<const int*>(reshape_param));
+
+  auto y_var_name = op_info->Input("Y").front();
+  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  auto y_dims = y->dims();
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
+  }
+  add_fc_layer(graph->GetCompilerHandle(),
+               const_cast<const int*>(&i_x_reshape_shape_data[0]),
+               2,
+               static_cast<const char*>(unique_op_reshape_name.c_str()),
+               const_cast<const int*>(&i_output_shape_data[0]),
+               output_dims.size(),
+               static_cast<const char*>(output_var_name.c_str()),
+               static_cast<const char*>(unique_op_name.c_str()),
+               i_x_reshape_shape_data[1],
+               i_output_shape_data[1],
+               static_cast<const float*>(y->mutable_data<float>()),
+               nullptr,
+               0,
+               0);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(mul, kBM, paddle::lite::subgraph::bm::MulConverter);
diff --git a/lite/kernels/bm/bridges/paddle_use_bridges.h b/lite/kernels/bm/bridges/paddle_use_bridges.h
new file mode 100644
index 0000000000..417d016c78
--- /dev/null
+++ b/lite/kernels/bm/bridges/paddle_use_bridges.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+USE_SUBGRAPH_BRIDGE(relu, kBM);
+USE_SUBGRAPH_BRIDGE(conv2d, kBM);
+USE_SUBGRAPH_BRIDGE(elementwise_add, kBM);
+USE_SUBGRAPH_BRIDGE(pool2d, kBM);
+USE_SUBGRAPH_BRIDGE(softmax, kBM);
+USE_SUBGRAPH_BRIDGE(mul, kBM);
+USE_SUBGRAPH_BRIDGE(batch_norm, kBM);
+USE_SUBGRAPH_BRIDGE(scale, kBM);
diff --git a/lite/kernels/bm/bridges/pool_op.cc b/lite/kernels/bm/bridges/pool_op.cc
new file mode 100644
index 0000000000..8b0c0cfffb
--- /dev/null
+++ b/lite/kernels/bm/bridges/pool_op.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
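// [Illustrative example, not part of the patch] The mul bridge above flattens
// X to 2-D before add_fc_layer, assuming Paddle-style reshape semantics for
// reshape_param (0 = copy the input dim, -1 = infer from the remainder).
// With made-up shapes X = [8, 64, 7, 7] and Y = [3136, 1000]:
//
//   int reshape_param[] = {0, -1};  // keep batch, fold the remaining dims
//   int flat[2] = {8, 64 * 7 * 7};  // -> [8, 3136]
//   // the FC layer then computes [8, 3136] x [3136, 1000] -> [8, 1000]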
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int32_t>(x_shape_data[i]);
+  }
+  // output
+  int32_t* shape[1];
+  int32_t dim[1];
+  const char* name[1];
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  std::vector<int32_t> i_output_shape_data(output_dims.size());
+  for (size_t i = 0; i < output_dims.size(); i++) {
+    i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
+  }
+  shape[0] = &i_output_shape_data[0];
+  name[0] = static_cast<const char*>(output_var_name.c_str());
+  dim[0] = output_dims.size();
+  auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
+  CHECK(pooling_type == "max" || pooling_type == "avg");
+  auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
+  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  auto global_pooling = op_info->GetAttr<bool>("global_pooling");
+  auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
+  bool average_exclusive = false;
+  if (pooling_type == "avg") {
+    average_exclusive = op_info->GetAttr<bool>("exclusive");
+  }
+  add_pooling_layer(
+      graph->GetCompilerHandle(),
+      const_cast<const int*>(&i_x_shape_data[0]),
+      x_dims.size(),
+      static_cast<const char*>(x_var_name.c_str()),
+      1,
+      shape,
+      dim,
+      name,
+      ksize[0],
+      ksize[1],
+      paddings[0],
+      paddings[0],
+      paddings[1],
+      paddings[1],
+      strides[0],
+      strides[1],
+      (ksize[0] > 1 && ksize[1] > 1) && pooling_type == "max" ? 0 : 1,
+      static_cast<int>(average_exclusive),
+      static_cast<int>(global_pooling),
+      static_cast<int>(ceil_mode),
+      static_cast<const char*>(unique_op_name.c_str()),
+      nullptr);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(pool2d,
+                         kBM,
+                         paddle::lite::subgraph::bm::PoolConverter);
diff --git a/lite/kernels/bm/bridges/scale_op.cc b/lite/kernels/bm/bridges/scale_op.cc
new file mode 100644
index 0000000000..bbbf896116
--- /dev/null
+++ b/lite/kernels/bm/bridges/scale_op.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
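// [Illustrative check, not part of the patch] The pool bridge above forwards
// the already-inferred output dims instead of recomputing them; assuming the
// standard Paddle pooling arithmetic, the expected size per spatial dim is:
//
//   int pooled_size(int in, int k, int p, int s, bool ceil_mode) {
//     int num = in + 2 * p - k;
//     return (ceil_mode ? (num + s - 1) / s : num / s) + 1;
//   }
//   // e.g. in = 224, k = 3, p = 1, s = 2 -> 112 (floor) or 113 (ceil_mode)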
+
+#include <bmcompiler_if.h>
+#include <bmcompiler_op_code.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  std::vector<int32_t> i_x_shape_data(x_dims.size());
+  for (size_t i = 0; i < x_dims.size(); i++) {
+    i_x_shape_data[i] = static_cast<int32_t>(x_shape_data[i]);
+  }
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto scale = op_info->GetAttr<float>("scale");
+  auto bias = op_info->GetAttr<float>("bias");
+  auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
+  if (!bias_after_scale) {
+    bias *= scale;
+  }
+  auto unique_op_scale_name = lite::subgraph::bm::UniqueName(op_type);
+  add_const_binary_layer(graph->GetCompilerHandle(),
+                         static_cast<const char*>(x_var_name.c_str()),
+                         const_cast<const int*>(&i_x_shape_data[0]),
+                         x_dims.size(),
+                         scale,
+                         static_cast<const char*>(unique_op_scale_name.c_str()),
+                         BINARY_MUL,
+                         0);
+  add_const_binary_layer(graph->GetCompilerHandle(),
+                         static_cast<const char*>(unique_op_scale_name.c_str()),
+                         const_cast<const int*>(&i_x_shape_data[0]),
+                         x_dims.size(),
+                         bias,
+                         static_cast<const char*>(output_var_name.c_str()),
+                         BINARY_ADD,
+                         0);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(scale,
+                         kBM,
+                         paddle::lite::subgraph::bm::ScaleConverter);
diff --git a/lite/kernels/bm/bridges/softmax_op.cc b/lite/kernels/bm/bridges/softmax_op.cc
new file mode 100644
index 0000000000..fc08d9db4f
--- /dev/null
+++ b/lite/kernels/bm/bridges/softmax_op.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
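// [Worked example, not part of the patch] Paddle's scale op computes
//   y = scale * x + bias       when bias_after_scale == true
//   y = scale * (x + bias)     when bias_after_scale == false
// so the bridge above folds the second form into the first:
//
//   float scale = 2.f, bias = 3.f;
//   bool bias_after_scale = false;
//   if (!bias_after_scale) bias *= scale;  // bias becomes 6.f
//   // BINARY_MUL by 2 then BINARY_ADD of 6 reproduces 2 * (x + 3)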
+#include <bmcompiler_if.h>
+#include "lite/kernels/bm/bridges/graph.h"
+#include "lite/kernels/bm/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace bm {
+
+int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto scope = op->scope();
+  auto op_info = op->op_info();
+  // input
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
+  size_t length = x_dims.size();
+  std::vector<int32_t> i_x_shape_data(length);
+  for (size_t i = 0; i < length; i++) {
+    i_x_shape_data[i] = static_cast<int32_t>(x_shape_data[i]);
+  }
+  // output
+  auto output_var_name = op_info->Output("Out").front();
+  auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
+  auto output_dims = output->dims();
+  const int64_t* output_shape_data =
+      const_cast<const int64_t*>(&output_dims.data()[0]);
+  length = output_dims.size();
+  std::vector<int32_t> i_output_shape_data(length);
+  for (size_t i = 0; i < length; i++) {
+    i_output_shape_data[i] = static_cast<int32_t>(output_shape_data[i]);
+  }
+  auto axis = op_info->GetAttr<int>("axis");
+  if (axis < 0) {
+    axis += x_dims.size();
+  }
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_dims.size()).production();
+  add_softmax_layer(graph->GetCompilerHandle(),
+                    const_cast<const int*>(&i_x_shape_data[0]),
+                    x_dims.size(),
+                    static_cast<const char*>(x_var_name.c_str()),
+                    const_cast<const int*>(&i_output_shape_data[0]),
+                    output_dims.size(),
+                    static_cast<const char*>(output_var_name.c_str()),
+                    inner_num,
+                    outer_num,
+                    x_dims[axis]);
+  graph->AddNode(output_var_name);
+  return SUCCESS;
+}
+
+}  // namespace bm
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(softmax,
+                         kBM,
+                         paddle::lite::subgraph::bm::SoftmaxConverter);
diff --git a/lite/kernels/bm/bridges/utility.cc b/lite/kernels/bm/bridges/utility.cc
new file mode 100644
index 0000000000..aa61462d04
--- /dev/null
+++ b/lite/kernels/bm/bridges/utility.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
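// [Illustrative numbers, not part of the patch] The softmax bridge above
// splits the input around the reduction axis; with made-up dims
// x_dims = [8, 1000, 7, 7] and axis = 1:
//
//   int outer_num = 8;      // production() of the dims before the axis
//   int inner_num = 7 * 7;  // production() of the dims after the axis
//   // add_softmax_layer(..., inner_num, outer_num, x_dims[axis] /* 1000 */)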
+ +#include "lite/kernels/bm/bridges/utility.h" +#include //NOLINT +#include + +namespace paddle { +namespace lite { +namespace subgraph { +namespace bm { + +std::string UniqueName(const std::string& prefix) { + static std::mutex counter_mtx; + static std::unordered_map counter_map; + std::unique_lock counter_lck(counter_mtx); + int counter = 1; + auto it = counter_map.find(prefix); + if (it == counter_map.end()) { + counter_map[prefix] = counter; + } else { + counter = ++(it->second); + } + + return prefix + "_" + std::to_string(counter); +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +} // namespace bm +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/bm/bridges/utility.h b/lite/kernels/bm/bridges/utility.h new file mode 100644 index 0000000000..55910bc47c --- /dev/null +++ b/lite/kernels/bm/bridges/utility.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace bm { + +std::string UniqueName(const std::string& prefix); + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +} // namespace bm +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc new file mode 100644 index 0000000000..83f9fe3bed --- /dev/null +++ b/lite/kernels/bm/subgraph_compute.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "lite/kernels/bm/subgraph_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/kernels/bm/bridges/graph.h" +#include "lite/kernels/bm/bridges/paddle_use_bridges.h" +#include "lite/kernels/bm/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace bm { + +int SubgraphEngine::BuildDeviceProgram() { + int status = 0; + subgraph::bm::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + graph.CreateCompilerHandle(); + auto& ctx = this->ctx_->template As(); + for (auto& inst : origin_program_) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kBM))) { + return subgraph::FAILED; + } + auto kernel = inst.kernel(); + status |= + bridges.Select(op_type, TARGET(kBM))(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + std::string net_name = "paddle_bitmain"; + __bmcompile_opt( + graph.GetCompilerHandle(), const_cast(net_name.c_str()), 2); + void* bmodel_data = nullptr; + unsigned int data_size = 0; + bm_hd_ = static_cast(ctx.GetHandle()); + finish_bmcompiler_data(graph.GetCompilerHandle(), &bmodel_data, &data_size); + bmrt_hd_ = bmrt_create(bm_hd_); + if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { + return subgraph::FAILED; + } + bmrt_get_network_names(bmrt_hd_, &net_names_); + net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); + auto& stage = net_info_->stages[0]; + // input + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + device_inputs_.resize(input_names_.size()); + for (size_t i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + bm_device_mem_t* p_mem = + static_cast(malloc(sizeof(bm_device_mem_t))); + CHECK(p_mem != nullptr); + CHECK_EQ(bm_malloc_device_byte( + bm_hd_, p_mem, origin_itensors_[i]->memory_size()), + BM_SUCCESS); + bmrt_tensor_with_device(&device_inputs_[i], + *p_mem, + net_info_->input_dtypes[i], + stage.input_shapes[i]); + } + // output + origin_odims_.resize(output_names_.size()); + origin_otensors_.resize(output_names_.size()); + device_outputs_.resize(output_names_.size()); + for (size_t i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + output_map_.insert(std::pair(output_names_[i], i)); + origin_otensors_[i]->mutable_data(); + } + for (size_t i = 0; i < output_names_.size(); i++) { + int mapping_index = output_map_.at(net_info_->output_names[i]); + bm_device_mem_t* p_mem = + static_cast(malloc(sizeof(bm_device_mem_t))); + CHECK(p_mem != nullptr); + CHECK_EQ(bm_malloc_device_byte( + bm_hd_, p_mem, origin_otensors_[mapping_index]->memory_size()), + BM_SUCCESS); + bmrt_tensor_with_device(&device_outputs_[i], + *p_mem, + net_info_->output_dtypes[i], + stage.output_shapes[i]); + } + + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + for (size_t i = 0; i < device_inputs_.size(); i++) { + bm_memcpy_s2d(bm_hd_, + device_inputs_[i].device_mem, + const_cast(origin_itensors_[i]->raw_data())); + } + bmrt_launch_tensor_ex(bmrt_hd_, + net_names_[0], + 
+                        static_cast<const bm_tensor_t*>(&device_inputs_[0]),
+                        net_info_->input_num,
+                        static_cast<bm_tensor_t*>(&device_outputs_[0]),
+                        net_info_->output_num,
+                        true,
+                        false);
+  bm_thread_sync(bm_hd_);
+  for (size_t i = 0; i < device_outputs_.size(); i++) {
+    bm_memcpy_d2s(bm_hd_,
+                  const_cast<void*>(origin_otensors_[i]->raw_data()),
+                  device_outputs_[i].device_mem);
+  }
+  return 0;
+}
+
+void SubgraphCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  engine_.reset(new SubgraphEngine(ctx_.get(),
+                                   param.sub_block_idx,
+                                   param.sub_block_desc,
+                                   param.input_data_names,
+                                   param.output_data_names,
+                                   param.scope));
+  CHECK(engine_);
+  engine_->Build();
+}
+
+void SubgraphCompute::Run() {
+  CHECK(engine_);
+  engine_->Launch();
+}
+
+}  // namespace bm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(subgraph,
+                     kBM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::bm::SubgraphCompute,
+                     def)
+    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h
new file mode 100644
index 0000000000..0e4b1dfa32
--- /dev/null
+++ b/lite/kernels/bm/subgraph_compute.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
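// [Summary comment, not part of the patch] Runtime flow implemented above:
// BuildDeviceProgram() converts the subgraph through the bmcompiler bridges,
// loads the produced bmodel via bmrt_create() / bmrt_load_bmodel_data(), and
// pre-allocates device memory for every input and output tensor; each
// LaunchDeviceProgram() then runs one synchronous cycle:
//
//   bm_memcpy_s2d -> bmrt_launch_tensor_ex -> bm_thread_sync -> bm_memcpy_d2s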
+
+#pragma once
+
+#include <bmruntime_interface.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/program.h"
+#include "lite/core/types.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace bm {
+
+class SubgraphEngine : public subgraph::Engine {
+ public:
+  SubgraphEngine(KernelContext *ctx,
+                 int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            ctx, block_idx, block_desc, input_names, output_names, scope) {}
+
+ protected:
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+
+ private:
+  void *bmrt_hd_;
+  std::vector<bm_tensor_t> device_inputs_;
+  std::vector<bm_tensor_t> device_outputs_;
+  std::map<std::string, int> output_map_;
+  const char **net_names_;
+  const bm_net_info_t *net_info_;
+  bm_handle_t bm_hd_;
+};
+
+class SubgraphCompute : public KernelLite<TARGET(kBM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SubgraphParam;
+  void PrepareForRun() override;
+  void Run() override;
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  std::unique_ptr<SubgraphEngine> engine_;
+};
+
+}  // namespace bm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index a8f2f7a68a..4e104ef748 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
   return()
 endif()
 
diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
index 6f1cb2aac7..0c1e71cbe3 100644
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -36,36 +36,36 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 
     if(LITE_BUILD_EXTRA)
-        lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-        lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+        lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     endif()
-    lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh
new file mode 100755
index 0000000000..f4cfee5ec6
--- /dev/null
+++ b/lite/tools/build_bm.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+set -ex
+
+# global variables with default values
+BM_SDK_ROOT="$(pwd)/../BM_SDK" # BM SDK
+TARGET_NAME="BM1682" # default target
+BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
+WITH_TESTING=ON # ON/OFF
+
+function print_usage {
+    echo -e "\nUSAGE:"
+    echo
+    echo "----------------------------------------"
+    echo -e "--bm_sdk_root=<path to the BM SDK>"
+    echo -e "--target_name=<target name, e.g. BM1682>"
+    echo "----------------------------------------"
+    echo
+}
+
+# readonly variables with default values
+readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
+                               -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
+                               -DWITH_PYTHON=OFF \
+                               -DLITE_WITH_ARM=OFF"
+
+readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
+
+readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
+readonly workspace=$(pwd)
+
+function prepare_thirdparty {
+    if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
+        rm -rf $workspace/third-party
+
+        if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
+            wget $THIRDPARTY_TAR
+        fi
+        tar xzf third-party-05b862.tar.gz
+    else
+        git submodule update --init --recursive
+    fi
+}
+
+# For code gen, a source file is generated after a test, but it is depended on
+# by some targets in cmake. Here we fake an empty file to make cmake work.
+function prepare_workspace {
+    # in build directory
+    # 1. Prepare gen_code file
+    GEN_CODE_PATH_PREFIX=lite/gen_code
+    mkdir -p ./${GEN_CODE_PATH_PREFIX}
+    touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+
+    # 2. Prepare debug tool
+    DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
+    mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
+    cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
+
+    # clone submodule
+    # git submodule update --init --recursive
+    prepare_thirdparty
+}
+
+function build_bm {
+    build_dir=${workspace}/build.lite.bm
+    mkdir -p $build_dir
+    cd $build_dir
+
+    prepare_workspace
+    cmake .. \
+        ${CMAKE_COMMON_OPTIONS} \
+        -DWITH_GPU=OFF \
+        -DWITH_MKLDNN=OFF \
+        -DLITE_WITH_X86=ON \
+        -DWITH_MKL=ON \
+        -DLITE_BUILD_EXTRA=ON \
+        -DLITE_WITH_XPU=OFF \
+        -DLITE_WITH_BM=ON \
+        -DWITH_TESTING=${WITH_TESTING} \
+        -DBM_SDK_ROOT=${BM_SDK_ROOT}
+
+    make -j$NUM_CORES_FOR_COMPILE
+
+    cd -
+    echo "Done"
+}
+
+function main {
+    # Parse command line.
+    for i in "$@"; do
+        case $i in
+            --target_name=*)
+                TARGET_NAME="${i#*=}"
+                shift
+                ;;
+            --bm_sdk_root=*)
+                BM_SDK_ROOT="${i#*=}"
+                shift
+                ;;
+            bm)
+                build_bm
+                shift
+                ;;
+            *)
+                # unknown option
+                print_usage
+                exit 1
+                ;;
+        esac
+    done
+}
+
+main $@
-- 
GitLab
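# [Usage example, not part of the patch; paths are hypothetical]
#   ./lite/tools/build_bm.sh --bm_sdk_root=/opt/BM_SDK --target_name=BM1682 bm
# The positional "bm" argument triggers build_bm; build artifacts are written
# to build.lite.bm/.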