diff --git a/CMakeLists.txt b/CMakeLists.txt index a6733d11fd97a290d43ef699ca71655241e93f81..767196359ec5b62921e9a367de30fd0f0a4e6686 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) @@ -129,6 +130,10 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 564173750e126c5f097e15f578f535a0370f8d07..8f9eb1b048f8a722f8dab6ed1bf01da8bf7819f0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,10 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") if (LITE_WITH_XTCL) diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+if(NOT LITE_WITH_RKNPU)
+  return()
+endif()
+
+if(NOT DEFINED RKNPU_DDK_ROOT)
+  set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
+  if(NOT RKNPU_DDK_ROOT)
+    message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
+  endif()
+endif()
+
+message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
+find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
+  PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
+if(NOT RKNPU_DDK_INC)
+  message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
+endif()
+
+include_directories("${RKNPU_DDK_ROOT}/include")
+
+set(RKNPU_SUB_LIB_PATH "lib64")
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+  set(RKNPU_SUB_LIB_PATH "lib64")
+endif()
+
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+  set(RKNPU_SUB_LIB_PATH "lib")
+endif()
+
+find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
+  PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
+
+if(NOT RKNPU_DDK_FILE)
+  message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
+else()
+  message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
+  add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
+  set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
+endif()
+
+set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 47e21e6e8088c8210334b63dac27ee048a3ed22f..772a2baaef772853639db8abc3ba702dc52ad825 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps ${lite_deps_DEPS})
@@ -88,6 +88,12 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()
 
+  if (LITE_WITH_RKNPU)
+    foreach(var ${lite_deps_RKNPU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
   if (LITE_WITH_XPU)
     foreach(var ${lite_deps_XPU_DEPS})
       set(deps ${deps} ${var})
@@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -142,6 +148,7 @@ function(lite_cc_library TARGET)
           CUDA_DEPS ${args_CUDA_DEPS}
           CL_DEPS ${args_CL_DEPS}
           BM_DEPS ${args_BM_DEPS}
+          RKNPU_DEPS ${args_RKNPU_DEPS}
           ARM_DEPS ${args_ARM_DEPS}
           CV_DEPS ${args_CV_DEPS}
           FPGA_DEPS ${args_FPGA_DEPS}
@@ -177,7 +184,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -191,7 +198,8 @@ function(lite_cc_binary TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -226,7 +234,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -248,7 +256,8 @@ function(lite_cc_test TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -280,6 +289,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -295,12 +305,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -323,6 +333,12 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") @@ -379,6 +395,15 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "MLU") if (NOT LITE_WITH_MLU) foreach(src ${args_SRCS}) @@ -427,7 +452,8 @@ function(add_kernel TARGET device level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -481,7 +507,8 @@ function(add_operator TARGET level) 
FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index cabce2f7cd2faaf954eda142f9aecc02e103bbcb..8f9574c92a3c6e361a664d59a751595a533ac78b 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,6 +7,7 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") @@ -76,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 579a45ecaf0b894d646f28d60cace42f6aaffc1d..20cb716644ceca9a8dc65c6c1fb958ef9e5dd736 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -34,9 +34,11 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} ) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels}) if(NOT APPLE) set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") @@ -59,6 +61,11 @@ else() # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -69,6 +76,7 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels}) endif() @@ -82,6 +90,12 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -90,6 +104,7 @@ message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") @@ -107,6 +122,7 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS 
${xpu_kernels}
+      RKNPU_DEPS ${rknpu_kernels}
       BM_DEPS ${bm_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels})
@@ -128,6 +144,7 @@ lite_cc_library(light_api SRCS light_api.cc
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      RKNPU_DEPS ${rknpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
@@ -147,6 +164,7 @@ if(WITH_TESTING)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      RKNPU_DEPS ${rknpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
@@ -248,6 +266,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
     ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
          --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
   add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
+
   # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
   # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
   #              DEPS ${lite_model_test_DEPS})
@@ -291,6 +310,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
     DEPS light_api program mir_passes paddle_api_light
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     BM_DEPS ${bm_kernels}
     ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -300,6 +320,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
     X86_DEPS ${x86_kernels}
     XPU_DEPS ${xpu_kernels}
     FPGA_DEPS ${fpga_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     BM_DEPS ${bm_kernels}
     MLU_DEPS ${mlu_kernels}
     ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
@@ -335,6 +356,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
     CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
     XPU_DEPS ${xpu_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     CL_DEPS ${opencl_kernels}
     X86_DEPS ${x86_kernels}
     FPGA_DEPS ${fpga_kernels}
@@ -356,6 +378,7 @@ if(NOT IOS)
     MLU_DEPS ${mlu_kernels}
     CL_DEPS ${opencl_kernels}
     BM_DEPS ${bm_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     FPGA_DEPS ${fpga_kernels}
     X86_DEPS ${x86_kernels}
     CUDA_DEPS ${cuda_kernels})
@@ -369,6 +392,7 @@ if(NOT IOS)
     MLU_DEPS ${mlu_kernels}
     CL_DEPS ${opencl_kernels}
     BM_DEPS ${bm_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
    FPGA_DEPS ${fpga_kernels}
     X86_DEPS ${x86_kernels}
     CUDA_DEPS ${cuda_kernels})
@@ -382,6 +406,7 @@ if(NOT IOS)
     MLU_DEPS ${mlu_kernels}
     CL_DEPS ${opencl_kernels}
     BM_DEPS ${bm_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     FPGA_DEPS ${fpga_kernels}
     X86_DEPS ${x86_kernels}
     CUDA_DEPS ${cuda_kernels})
@@ -392,6 +417,7 @@ if(NOT IOS)
     CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
     XPU_DEPS ${xpu_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     MLU_DEPS ${mlu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
@@ -404,17 +430,20 @@ if(NOT IOS)
     CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
     XPU_DEPS ${xpu_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     MLU_DEPS ${mlu_kernels}
     CL_DEPS ${opencl_kernels}
     BM_DEPS ${bm_kernels}
     FPGA_DEPS ${fpga_kernels}
     X86_DEPS ${x86_kernels}
     CUDA_DEPS ${cuda_kernels})
+
   lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels}
     ARM_DEPS ${arm_kernels}
     CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
+    RKNPU_DEPS ${rknpu_kernels}
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
diff --git a/lite/api/opt.cc b/lite/api/opt.cc
index 7a8cd7f1ef1234269c986b781f0546b26df53c4b..efad7b74e943c29c9af1af5c14ac51621eefe576 100644
--- a/lite/api/opt.cc
+++ b/lite/api/opt.cc
@@ -109,6 +109,10 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); } else if (target_repr == "mlu") { valid_places.emplace_back(TARGET(kMLU)); } else { @@ -187,6 +191,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -251,16 +256,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index aceb047b64f54ac18ac492ef495d32c3180ad4b4..efd22fc22a4180c3cac9f269fc14f6541c16b885 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "mlu"}; + "mlu", + "rknpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) { "kNPU", "kXPU", "kMLU", - "kBM"}; + "kBM", + "kRKNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index f57b9832f2b35fc3db74232192bd397ec8b4930c..2b271a4872e7e14c48632a2bb1aae56d53145cba 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -54,8 +54,9 @@ enum class TargetType : int { kXPU = 9, kBM = 10, kMLU = 11, + kRKNPU = 12, kAny = 6, // any target - NUM = 12, // number of fields. + NUM = 13, // number of fields. 
 };
 
 enum class PrecisionType : int {
   kUnk = 0,
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index fb459ae3621d1281f0a2433ca6b237a165d078a1..1e8734a6e45ead93bb33024a2e918cdb401265d9 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -8,3 +8,4 @@ add_subdirectory(npu)
 add_subdirectory(xpu)
 add_subdirectory(mlu)
 add_subdirectory(bm)
+add_subdirectory(rknpu)
diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe
--- /dev/null
+++ b/lite/backends/rknpu/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(NOT LITE_WITH_RKNPU)
+  return()
+endif()
+
+lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380
--- /dev/null
+++ b/lite/backends/rknpu/device.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/rknpu/device.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace rknpu {
+
+std::unique_ptr<rk::nn::Exection> Device::Build(
+    std::string& model_name,                                    // NOLINT
+    rk::nn::Graph* rk_graph,                                    // NOLINT
+    std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes,   // NOLINT
+    std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes   // NOLINT
+    ) {
+  VLOG(3) << "[RKNPU] Build model";
+
+  rk_graph->SetInputsOutputs(input_nodes, output_nodes);
+
+  std::unique_ptr<rk::nn::Exection> exector =
+      std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
+
+  exector->Build();
+
+  return exector;
+}
+
+}  // namespace rknpu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15
--- /dev/null
+++ b/lite/backends/rknpu/device.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
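For a sense of how the new kRKNPU target declared above (paddle_place.h, opt.cc) is meant to be consumed, the following is a minimal, illustrative C++ sketch of an application requesting it through the existing CxxConfig API. It is not part of the patch: the model directory is a placeholder, and the choice of fallback places is only an assumption (int8 subgraphs on the Rockchip NPU, with unsupported ops falling back to ARM float kernels).

#include <vector>
#include "paddle_api.h"  // lite/api/paddle_api.h in-tree

int main() {
  using namespace paddle::lite_api;  // NOLINT
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1_quant");  // placeholder: a quantized Paddle model
  // Prefer int8 subgraphs on the RKNPU, fall back to ARM float kernels elsewhere.
  config.set_valid_places({Place{TARGET(kRKNPU), PRECISION(kInt8)},
                           Place{TARGET(kARM), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return predictor ? 0 : 1;
}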
+ +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. + std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index afc104073684ff00395fb32335630705ff3f7bc8..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.h b/lite/core/context.h index fa38b7c81bc511e00b860999bbc1ae7a46b7ae72..7ab45bae1d3b3ff518ffa7a1db61cd1f56c92728 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -59,6 +59,7 @@ using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -103,6 +104,21 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { @@ -392,6 +408,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -461,6 +483,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 38293ede76ed35bf05767ce1333947b7dfdbc4ac..6c7a7c5803268f0729be3a1d2164c0598c8738bd 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) - .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)}); diff --git 
a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 40cad8f6af75300ab85753b16e391daeeadc6c2f..37fff018caf4a6d90a48ad3f173ec28c09866690 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, paddle::lite::mir::QuantizedOpAttributesInferencePass) - .BindTargets({TARGET(kNPU)}); + .BindTargets({TARGET(kNPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index eecd9348ae684929d3f55dee2a94921a078f148c..5c5dc3204b8728e8b30661fae21b056db6960179 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -69,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); @@ -93,5 +107,7 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index f83448df42ffe6d6d8c5b37503b5127290037dce..b89c20f3bd4b7ca8e9650d20925f5b75dc26ec59 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class MLUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index b51fd2b3db4c4699e3ece5404af41db97a3d5a76..0c8d42f4e2dc0b0a32d352ed9b460e1a0b7bfb90 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -110,6 +110,9 @@ std::list> KernelRegistry::Create( case TARGET(kMLU): { CREATE_KERNEL(kMLU); } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -232,6 +235,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 
96c9fc2358199594cf9590385c2efdaf1c671425..ba9f3d5bd1f503160344c455889ff444a39e9ef2 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -251,6 +251,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 5004be79afbef51a3503b92188dcd8b82539c28a..b4675819a5def48cbec56029bb97be587df65ac3 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -86,6 +86,7 @@ class Optimizer { "npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 78bb8d10b798b73861ddbf25e427289fc2984a55..b00e818c6cd21de717dab7b896a8f757b5b0011a 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 21a5650c7fa1df35102e86da96faf89bbae46541..68ccba569ad0726bdda4c15840c7680b51ba0c58 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU) return() endif() diff --git a/lite/kernels/rknpu/CMakeLists.txt b/lite/kernels/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebb432748f363fb6326dc7d06ced5a5238061637 --- /dev/null +++ b/lite/kernels/rknpu/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(bridges) +add_kernel(subgraph_compute_rknpu RKNPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_rknpu ${rknpu_subgraph_bridges}) diff --git a/lite/kernels/rknpu/bridges/CMakeLists.txt b/lite/kernels/rknpu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c4801553df8c9bf17eea595fce29206c24aa0cd --- /dev/null +++ b/lite/kernels/rknpu/bridges/CMakeLists.txt @@ -0,0 +1,34 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(subgraph_bridge_utility_rknpu SRCS utility.cc DEPS ${rknpu_builder_libs} tensor) +lite_cc_library(subgraph_bridge_graph_rknpu SRCS graph.cc DEPS subgraph_bridge_utility_rknpu) + +set(rknpu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine 
subgraph_bridge_utility_rknpu subgraph_bridge_graph_rknpu) + +lite_cc_library(subgraph_bridge_conv_op_rknpu SRCS conv_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_act_op_rknpu SRCS act_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_softmax_op_rknpu SRCS softmax_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_pool_op_rknpu SRCS pool_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_fc_op_rknpu SRCS fc_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_batch_norm_op_rknpu SRCS batch_norm_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_concat_op_rknpu SRCS concat_op.cc DEPS ${rknpu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_elementwise_ops_rknpu SRCS elementwise_ops.cc DEPS ${rknpu_subgraph_bridge_deps}) + + +set(rknpu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_utility_rknpu + subgraph_bridge_graph_rknpu + subgraph_bridge_conv_op_rknpu + subgraph_bridge_act_op_rknpu + subgraph_bridge_softmax_op_rknpu + subgraph_bridge_pool_op_rknpu + subgraph_bridge_fc_op_rknpu + subgraph_bridge_batch_norm_op_rknpu + subgraph_bridge_concat_op_rknpu + subgraph_bridge_elementwise_ops_rknpu + CACHE INTERNAL "rknpu_subgraph_bridges") + +message(STATUS "+++++ rknpu_subgraph_bridges: ${rknpu_subgraph_bridges}") diff --git a/lite/kernels/rknpu/bridges/act_op.cc b/lite/kernels/rknpu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..decc9b46d653594d7e5eaa53766d43dc841b14b5 --- /dev/null +++ b/lite/kernels/rknpu/bridges/act_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +// #include "lite/kernels/npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + auto output_dims = output->dims(); + const int64_t* x_shape_data = const_cast(&x_dims.data()[0]); + const int64_t* output_shape_data = + const_cast(&output_dims.data()[0]); + std::vector i_x_shape_data(x_dims.size()); + std::vector i_output_shape_data(output_dims.size()); + + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->precision() == PRECISION(kFloat)); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->precision() == PRECISION(kFloat)); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + + for (size_t i = 0; i < x_dims.size(); i++) { + i_x_shape_data[i] = static_cast(x_shape_data[i]); + } + for (size_t i = 0; i < output_dims.size(); i++) { + i_output_shape_data[i] = static_cast(output_shape_data[i]); + } + CHECK_EQ(op_type, "relu"); + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_var_name)) { + x_node = graph->Get(x_var_name); + } else { + x_node = graph->Add(x_var_name, *x, x_type->precision(), x_type->layout()); + } + + auto output_node = graph->Add( + output_var_name, *output, out_type->precision(), out_type->layout()); + auto rGraph = graph->GetHandle(); + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(x_node->data()); + outputs.push_back(output_node->data()); + auto relu = + rGraph->AddOperator(rk::nn::OperatorType::RELU, inputs, outputs, nullptr); + + return SUCCESS; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(relu, + kRKNPU, + paddle::lite::subgraph::rknpu::ActConverter); diff --git a/lite/kernels/rknpu/bridges/batch_norm_op.cc b/lite/kernels/rknpu/bridges/batch_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ad892e3b8073862abede3d01e25e9b51c005631 --- /dev/null +++ b/lite/kernels/rknpu/bridges/batch_norm_op.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
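The int8 path in the bridges below attaches per-tensor quantization info (scale, quant_bits) read from the op attributes, and the conv2d bridge further down additionally requantizes its float bias into int32 with a per-output-channel scale of input_scale * weight_scale[c], clamped to the int32 range. The self-contained helper below restates only that bias arithmetic so it can be checked in isolation; the function name is illustrative and not part of the patch.

#include <algorithm>
#include <cstdint>
#include <vector>

// Per-channel int32 bias quantization as used by the int8 conv2d bridge:
//   scale[c]  = input_scale * weight_scale[c]
//   bias_q[c] = clamp(bias[c] / scale[c], -(2^31 - 1), 2^31 - 1)
std::vector<int32_t> QuantizeConvBias(const std::vector<float>& bias,
                                      float input_scale,
                                      const std::vector<float>& weight_scale) {
  const double kMax = 2147483647.0;  // 2^31 - 1, mirroring dtype_max in the bridge
  std::vector<int32_t> bias_q(bias.size());
  for (size_t c = 0; c < bias.size(); ++c) {
    const double scale = static_cast<double>(input_scale) * weight_scale[c];
    const double q = bias[c] / scale;
    bias_q[c] = static_cast<int32_t>(std::min(std::max(q, -kMax), kMax));
  }
  return bias_q;
}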
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); + auto scale_type = kernel->GetInputDeclType("Scale"); + CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); + auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); + auto mean_type = kernel->GetInputDeclType("Mean"); + CHECK(mean_type->layout() == DATALAYOUT(kNCHW)); + auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); + auto variance_type = kernel->GetInputDeclType("Variance"); + CHECK(variance_type->layout() == DATALAYOUT(kNCHW)); + auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); + auto y_type = kernel->GetOutputDeclType("Y"); + auto y = scope->FindMutableTensor(y_name); + CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + float momentum = op_info->GetAttr("momentum"); + float epsilon = op_info->GetAttr("epsilon"); + int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1 + bool use_global_stats = op_info->GetAttr("use_global_stats"); + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Scale, Bias, Mean, Variance node + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.push_back(output_scale); + y->mutable_data(); + } + + output_node = graph->Add(y_name, *y, precision, layout, output_qnt); + + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(x_node->data()); + inputs.push_back(mean_node->data()); + inputs.push_back(variance_node->data()); + inputs.push_back(scale_node->data()); + 
inputs.push_back(bias_node->data()); + outputs.push_back(output_node->data()); + + rk::nn::BatchNormAttr attrs; + attrs.eps = epsilon; + + auto rGraph = graph->GetHandle(); + auto bn = rGraph->AddOperator( + rk::nn::OperatorType::BATCH_NORM, inputs, outputs, &attrs); + + return SUCCESS; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kRKNPU, + paddle::lite::subgraph::rknpu::BatchNormConverter); diff --git a/lite/kernels/rknpu/bridges/concat_op.cc b/lite/kernels/rknpu/bridges/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..382d7c3a6038cd2bd0998debf157ee494f24de91 --- /dev/null +++ b/lite/kernels/rknpu/bridges/concat_op.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " << op_type << " ... 
"; + + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto x_type = kernel->GetInputDeclType("X"); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + auto output = scope->FindMutableTensor(out_name); + + auto axis = op_info->GetAttr("axis"); + auto num = x_names.size(); + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // Traverse all of input nodes which are added into the new created concat + // node + std::vector> inputs; + std::vector> outputs; + + int idx = 1; + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.quant_bits = bit_length; + qnt.scale.push_back(input_scale); + x->mutable_data(); + } + x_node = + graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt); + } + + inputs.push_back(x_node->data()); + idx++; + } + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + + output_node = graph->Add(out_name, *output, precision, layout, output_qnt); + outputs.push_back(output_node->data()); + + rk::nn::ConcatAttr attrs; + attrs.axis = axis; + + auto rGraph = graph->GetHandle(); + auto concat = rGraph->AddOperator( + rk::nn::OperatorType::CONCAT, inputs, outputs, &attrs); + + return SUCCESS; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(concat, + kRKNPU, + paddle::lite::subgraph::rknpu::ConcatConverter); diff --git a/lite/kernels/rknpu/bridges/conv_op.cc b/lite/kernels/rknpu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d474f0ef10771b1f8a0fdc6c3446c97eff261ec --- /dev/null +++ b/lite/kernels/rknpu/bridges/conv_op.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/conv_op.h" +#include +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " << op_type << "... "; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + auto fuse_relu = op_info->GetAttr("fuse_relu"); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + // Check depthwise mode + bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1); + auto weight_scale = op_info->GetAttr>("weight_scale"); + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // // Input node + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.scale.clear(); + qnt.scale.push_back(input_scale); + qnt.quant_bits = bit_length; + } + input_node = + graph->Add(input_name, *input, input->precision(), layout, qnt); + } + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[NPU] Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + // Filter node + std::shared_ptr filter_node = nullptr; + QuantizationInfo filter_qnt; + + filter_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + filter_qnt.scale = weight_scale; + filter_qnt.quant_bits = bit_length; + } + + filter_node = + graph->Add(filter_name, 
*filter, filter->precision(), layout, filter_qnt); + + // Add bias node if exists bias + // Supports the bias nodes with the following dimensions + // 0: {oc} + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + } else { + LOG(WARNING) + << "[RKNPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + + if (enable_int8) { + auto bias_name_qnt = bias_name + "/qnt"; + auto* bias_qnt = scope->NewTensor(bias_name_qnt); + + bias_qnt->Resize(bias_shape); + bias_qnt->set_persistable(true); + bias_qnt->set_precision(PrecisionType::kInt32); + + auto* bias_qnt_data = bias_qnt->mutable_data(); + auto* bias_data = bias->mutable_data(); + + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + qnt.quant_bits = 32; + + qnt.scale.resize(weight_scale.size()); + for (int i = 0; i < weight_scale.size(); i++) { + qnt.scale[i] = input_scale * weight_scale[i]; + } + + auto dtype_max = static_cast((1 << (qnt.quant_bits - 1)) - 1); + auto dtype_min = static_cast(0 - dtype_max); + + for (int i = 0; i < oc; i++) { + bias_qnt_data[i] = + std::min(std::max(static_cast(bias_data[i] / qnt.scale[i]), + dtype_min), + dtype_max); + } + + bias_node = graph->Add( + bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt); + } else { + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + } else { + auto bias_name = filter_name + "/bias/dummy"; + auto* bias = scope->NewTensor(bias_name); + std::vector bias_shape = {oc}; + + bias->Resize(bias_shape); + bias->set_persistable(true); + + if (enable_int8) { + bias->set_precision(PrecisionType::kInt32); + auto* bias_data = bias->mutable_data(); + + for (int i = 0; i < oc; i++) { + bias_data[i] = 0; + } + + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + qnt.quant_bits = 32; + qnt.scale.resize(weight_scale.size()); + for (int i = 0; i < weight_scale.size(); i++) { + qnt.scale[i] = input_scale * weight_scale[i]; + } + + bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt); + } else { + bias->set_precision(PrecisionType::kFloat); + auto* bias_data = bias->mutable_data(); + + for (int i = 0; i < oc; i++) { + bias_data[i] = 0.0; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + // Conv node + std::shared_ptr conv_node = nullptr; + std::shared_ptr output_node = nullptr; + std::vector> inputs; + std::vector> outputs; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + + output_node = graph->Add(output_name, *output, precision, layout, output_qnt); + + inputs.push_back(input_node->data()); + inputs.push_back(filter_node->data()); + inputs.push_back(bias_node->data()); + outputs.push_back(output_node->data()); + + rk::nn::Conv2DAttr attr; + attr.ksize[0] = filter_dims[2]; + attr.ksize[1] = filter_dims[3]; + attr.stride[0] = strides[0]; + attr.stride[1] = strides[1]; + attr.pad[0] = paddings[0]; + attr.pad[1] = paddings[1]; + attr.pad[2] = paddings[2]; + 
attr.pad[3] = paddings[3]; + attr.group = groups; + attr.weights = oc; + attr.dilation[0] = dilations[0]; + attr.dilation[1] = dilations[1]; + attr.pad_type = rk::nn::PadType::AUTO; + attr.has_relu = fuse_relu; + + if (is_depthwise_mode) { + attr.multiplier = 1; + } else { + attr.multiplier = 0; + } + + auto rGraph = graph->GetHandle(); + auto conv = rGraph->AddOperator( + rk::nn::OperatorType::CONV2D, inputs, outputs, &attr, output_name); + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kRKNPU, + paddle::lite::subgraph::rknpu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kRKNPU, + paddle::lite::subgraph::rknpu::ConvConverter); diff --git a/lite/kernels/rknpu/bridges/elementwise_ops.cc b/lite/kernels/rknpu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..dbd1f9ccb2a49115a9a0fc6d51ad4537cac253ed --- /dev/null +++ b/lite/kernels/rknpu/bridges/elementwise_ops.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +std::vector CvtYShape(const DDim& x_dims, + const DDim& y_dims, + int axis) { + CHECK_EQ(x_dims.size(), 4UL) << "[RKNPU] Only support 4-dimension x"; + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto y_name = op_info->Input("Y").front(); + auto y_type = kernel->GetInputDeclType("Y"); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + auto output = scope->FindMutableTensor(out_name); + auto axis = op_info->GetAttr("axis"); + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); 
+ PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.scale.clear(); + qnt.scale.push_back(input_scale); + qnt.quant_bits = op_info->GetAttr("bit_length"); + } + x_node = graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt); + } + + // Y node + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); + } else { + // auto y_new_shape = CvtYShape(x_dims, y_dims, axis); + // y_node = graph->Add(y_name, *y, y_new_shape); + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.quant_bits = bit_length; + qnt.scale.clear(); + qnt.scale.push_back(input_scale); + } + y_node = graph->Add(y_name, *y, y_type->precision(), y_type->layout(), qnt); + } + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.clear(); + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + + output_node = graph->Add( + out_name, *output, x_type->precision(), x_type->layout(), output_qnt); + + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(x_node->data()); + inputs.push_back(y_node->data()); + outputs.push_back(output_node->data()); + + auto rGraph = graph->GetHandle(); + + // Elementwise node + if (op_type == "elementwise_add") { + auto elt_node = rGraph->AddOperator( + rk::nn::OperatorType::ADD, inputs, outputs, nullptr); + } else if (op_type == "elementwise_sub") { + auto elt_node = rGraph->AddOperator( + rk::nn::OperatorType::SUBTRACT, inputs, outputs, nullptr); + } else if (op_type == "elementwise_mul") { + auto elt_node = rGraph->AddOperator( + rk::nn::OperatorType::MULTIPLY, inputs, outputs, nullptr); + } else if (op_type == "elementwise_div") { + auto elt_node = rGraph->AddOperator( + rk::nn::OperatorType::DIVIDE, inputs, outputs, nullptr); + } else { + LOG(WARNING) << "[RKNPU] Unsupported op type: " << op_type; + return FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kRKNPU, + paddle::lite::subgraph::rknpu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kRKNPU, + paddle::lite::subgraph::rknpu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kRKNPU, + paddle::lite::subgraph::rknpu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kRKNPU, + paddle::lite::subgraph::rknpu::ElementwiseConverter); diff --git a/lite/kernels/rknpu/bridges/fc_op.cc b/lite/kernels/rknpu/bridges/fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef548ed222a69bbc8c116e4146c0a0cea128e81a --- /dev/null +++ b/lite/kernels/rknpu/bridges/fc_op.cc @@ -0,0 +1,247 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + auto input_name = op_info->Input("Input").front(); + auto input_type = kernel->GetInputDeclType("Input"); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_GE(input_dims.size(), 2UL); + auto w_name = op_info->Input("W").front(); + auto w_type = kernel->GetInputDeclType("W"); + auto w = scope->FindMutableTensor(w_name); + auto w_dims = w->dims(); + CHECK_EQ(w_dims.size(), 2UL); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + auto output = scope->FindMutableTensor(out_name); + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "[RKNPU] input dims: " << input_dims << " w dims: " << w_dims + << " m: " << m << " k: " << k << " n: " << n; + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // Create input node and reshape it to (m, k, 1, 1) + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + input_node = graph->Add(input_name, *input); + } + + // Create w const node, set its shape to (n, k) and fill with + // the transposed w tensor + auto* transpose_w = scope->NewTensor(w_name + "/transpose"); + std::shared_ptr trans_w_node = nullptr; + transpose_w->Resize({n, k}); + transpose_w->set_persistable(true); + + if (enable_int8) { + QuantizationInfo filter_qnt; + auto weight_scale = op_info->GetAttr>("weight_scale"); + filter_qnt.enable_int8 = enable_int8; + filter_qnt.scale = weight_scale; + filter_qnt.quant_bits = bit_length; + + auto transpose_w_data = transpose_w->mutable_data(); + auto w_data = w->mutable_data(); + + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + transpose_w_data[j * k + i] = w_data[i * n + j]; + } + } + trans_w_node = graph->Add( + w_name, *transpose_w, precision, w_type->layout(), filter_qnt); + } else { + auto transpose_w_data = transpose_w->mutable_data(); + auto w_data = 
w->mutable_data(); + + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + transpose_w_data[j * k + i] = w_data[i * n + j]; + } + } + trans_w_node = + graph->Add(w_name, *transpose_w, precision, w_type->layout()); + } + + // Add bias node if bias tensor exists + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + std::vector bias_shape = {n}; + + VLOG(3) << "[RKNPU] bias precision: " + << PrecisionToStr(bias->precision()); + // We need to quantize bias + if (enable_int8) { + auto bias_name_qnt = bias_name + "/qnt"; + auto* bias_qnt = scope->NewTensor(bias_name_qnt); + auto weight_scale = + op_info->GetAttr>("weight_scale"); + + bias_qnt->Resize(bias_shape); + bias_qnt->set_persistable(true); + bias_qnt->set_precision(PrecisionType::kInt32); + + auto* bias_qnt_data = bias_qnt->mutable_data(); + auto* bias_data = bias->mutable_data(); + + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + qnt.quant_bits = 32; + qnt.scale.resize(weight_scale.size()); + + for (int i = 0; i < weight_scale.size(); i++) { + qnt.scale[i] = input_scale * weight_scale[i]; + } + + auto dtype_max = static_cast((1 << (qnt.quant_bits - 1)) - 1); + auto dtype_min = static_cast(0 - dtype_max); + + for (int i = 0; i < n; i++) { + bias_qnt_data[i] = + std::min(std::max(static_cast(bias_data[i] / qnt.scale[i]), + dtype_min), + dtype_max); + } + + bias_node = graph->Add( + bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt); + } else { + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + } else { + auto bias_name = w_name + "/bias/dummy"; + auto* bias = scope->NewTensor(bias_name); + std::vector bias_shape = {n}; + + bias->Resize(bias_shape); + bias->set_persistable(true); + + if (enable_int8) { + auto weight_scale = op_info->GetAttr>("weight_scale"); + bias->set_precision(PrecisionType::kInt32); + auto* bias_data = bias->mutable_data(); + + for (int i = 0; i < n; i++) { + bias_data[i] = 0; + } + + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + qnt.quant_bits = 32; + qnt.scale.resize(weight_scale.size()); + + for (int i = 0; i < weight_scale.size(); i++) { + qnt.scale[i] = input_scale * weight_scale[i]; + } + + bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt); + } else { + bias->set_precision(PrecisionType::kFloat); + auto* bias_data = bias->mutable_data(); + + for (int i = 0; i < n; i++) { + bias_data[i] = 0.0; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.clear(); + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + output_node = graph->Add(out_name, *output, precision, layout, output_qnt); + + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(input_node->data()); + inputs.push_back(trans_w_node->data()); + inputs.push_back(bias_node->data()); + outputs.push_back(output_node->data()); + + rk::nn::FCAttr attrs; + attrs.weights = n; + attrs.has_relu = false; + + auto rGraph = graph->GetHandle(); + auto fc = rGraph->AddOperator( + rk::nn::OperatorType::FULLCONNECT, inputs, outputs, &attrs); + + return 
REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fc, + kRKNPU, + paddle::lite::subgraph::rknpu::FCConverter); diff --git a/lite/kernels/rknpu/bridges/graph.cc b/lite/kernels/rknpu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c1297c2e7e14d2138e05c4949573fd1db7cc235 --- /dev/null +++ b/lite/kernels/rknpu/bridges/graph.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/rknpu/bridges/graph.h" +#include +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[RKNPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + PrecisionType precision, + DataLayoutType layout, + const QuantizationInfo& qnt) { + std::shared_ptr node = nullptr; + + if (precision == PrecisionType::kUnk) { + precision = tensor.precision(); // todo + } + + if (precision == PrecisionType::kUnk) { + if (qnt.enable_int8 && qnt.quant_bits == 8) { + precision = PrecisionType::kInt8; + } else if (!qnt.enable_int8) { + precision = PrecisionType::kFloat; + } else { + LOG(ERROR) << "[rknpu]:Graph:: tensor precision unknown!"; + } + } + + if (precision != tensor.precision()) { + LOG(INFO) << "[rknpu]:Graph::Add: tensor precision mismatch!" 
<< name << ":" + << PrecisionToStr(precision) << " vs " + << PrecisionToStr(tensor.precision()); + } + + if (tensor.persistable()) { + // Const node + node = std::make_shared(precision, layout, Node::Role::kConst); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + auto attr = std::make_shared(); + attr->precision = ToRknpuPrecisionType(precision); + attr->layout = ToRknpuDataLayoutType(layout); + attr->role = rk::nn::TensorRole::CONST; + attr->name = name; + + switch (precision) { + case PrecisionType::kInt8: + attr->qntBits = 8; + attr->qntType = rk::nn::QuantizationType::SYMMETRIC; + attr->qntParamSymmetric.scale = qnt.scale; + break; + case PrecisionType::kInt32: + attr->qntBits = 32; + attr->qntType = rk::nn::QuantizationType::SYMMETRIC; + attr->qntParamSymmetric.scale = qnt.scale; + break; + default: + break; + } + + attr->dims.resize(shape.size()); + for (int i = 0; i < shape.size(); i++) { + attr->dims[i] = shape[i]; + } + + LOG(INFO) << "[rknpu]:Graph::Add const node:" << name + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout); + node->set_data( + rgraph_->CreateTensor(attr, const_cast(tensor.raw_data()))); + } else { + // Data node + node = Add(name, shape, precision, layout, qnt); + } + return node; +} + +// Data node +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout, + const QuantizationInfo& qnt) { + auto node = std::make_shared(precision, layout, Node::Role::kData); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + auto attr = std::make_shared(); + attr->precision = ToRknpuPrecisionType(precision); + attr->layout = ToRknpuDataLayoutType(layout); + attr->role = rk::nn::TensorRole::VAR; + attr->name = name; + + switch (precision) { + case PrecisionType::kInt8: + attr->qntBits = 8; + attr->qntType = rk::nn::QuantizationType::SYMMETRIC; + attr->qntParamSymmetric.scale = qnt.scale; + break; + case PrecisionType::kInt32: + attr->qntBits = 32; + attr->qntType = rk::nn::QuantizationType::SYMMETRIC; + attr->qntParamSymmetric.scale = qnt.scale; + break; + + default: + break; + } + + attr->dims.resize(shape.size()); + for (int i = 0; i < shape.size(); i++) { + attr->dims[i] = shape[i]; + } + + LOG(INFO) << "[rknpu]:Graph::Add data node:" << name + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout); + node->set_data(rgraph_->CreateTensor(attr, nullptr)); // todo + return node; +} + +Graph::Graph() { + rgraph_ = new rk::nn::Graph(); + CHECK(rgraph_ != nullptr); +} + +Graph::~Graph() {} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/rknpu/bridges/graph.h b/lite/kernels/rknpu/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..a106d282de9e2c13f422dd5d8bd736968741a6d6 --- /dev/null +++ b/lite/kernels/rknpu/bridges/graph.h @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "rknpu/rknpu_pub.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +// Graph and node is defined to collect all of converted RKNPU IR nodes +struct QuantizationInfo { + int enable_int8; + int quant_bits; + std::vector scale; +}; + +class Node { + public: + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_role(Role role) { role_ = role; } + void set_quant_param(const QuantizationInfo& qnt) { qnt_ = qnt; } + + std::shared_ptr data() { return data_; } + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + Role role() const { return role_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } + + private: + std::shared_ptr data_{nullptr}; + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + Role role_{Role::kVar}; + QuantizationInfo qnt_; +}; + +class Graph { + public: + Graph(); + ~Graph(); + + public: + int Add(const std::string& name, std::shared_ptr node); + + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + PrecisionType precision = PRECISION(kUnk), + DataLayoutType layout = DATALAYOUT(kNCHW), + const QuantizationInfo& qnt = QuantizationInfo()); + std::shared_ptr Get(const std::string& name) { + CHECK(Has(name)) << "[RKNPU] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + PrecisionType precision = PRECISION(kUnk), + DataLayoutType layout = DATALAYOUT(kNCHW), + const QuantizationInfo& qnt = QuantizationInfo()) { + return Add(name, tensor, tensor.dims().Vectorize(), precision, layout, qnt); + } + + // Data node + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW), + const QuantizationInfo& qnt = QuantizationInfo()); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW), + const QuantizationInfo& qnt = QuantizationInfo()) { + return Add(name, dims.Vectorize(), precision, layout, qnt); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + rk::nn::Graph* GetHandle() { return rgraph_; } + + private: + std::unordered_map>> nodes_; + rk::nn::Graph* rgraph_; +}; + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/rknpu/bridges/paddle_use_bridges.h b/lite/kernels/rknpu/bridges/paddle_use_bridges.h new file mode 100644 index 
0000000000000000000000000000000000000000..e63033bfcc01ba66e0b01c01aedd15319a3968ce --- /dev/null +++ b/lite/kernels/rknpu/bridges/paddle_use_bridges.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kRKNPU); +USE_SUBGRAPH_BRIDGE(conv2d, kRKNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kRKNPU); + +USE_SUBGRAPH_BRIDGE(pool2d, kRKNPU); +USE_SUBGRAPH_BRIDGE(fc, kRKNPU); +USE_SUBGRAPH_BRIDGE(softmax, kRKNPU); +USE_SUBGRAPH_BRIDGE(batch_norm, kRKNPU); +USE_SUBGRAPH_BRIDGE(concat, kRKNPU); + +USE_SUBGRAPH_BRIDGE(elementwise_add, kRKNPU); +USE_SUBGRAPH_BRIDGE(elementwise_sub, kRKNPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kRKNPU); +USE_SUBGRAPH_BRIDGE(elementwise_div, kRKNPU); diff --git a/lite/kernels/rknpu/bridges/pool_op.cc b/lite/kernels/rknpu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d6f8e11e57f0528acdc8ef526186e56a2f5545d --- /dev/null +++ b/lite/kernels/rknpu/bridges/pool_op.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
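+// Bridge that maps the Paddle pool2d op onto the rk::nn POOL operator,
+// propagating int8 scales when the input tensor is already quantized.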
+ +#include "lite/operators/pool_op.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + + auto out_name = op_info->Output("Out").front(); + auto output = scope->FindMutableTensor(out_name); + + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (x->precision() == PRECISION(kInt8)) { + // enable_int8 = op_info->GetAttr("enable_int8"); + enable_int8 = true; + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + LOG(WARNING) << "[RKNPU] Pooling int8"; + } + } + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.scale.push_back(input_scale); + qnt.quant_bits = bit_length; + } + x_node = graph->Add(x_name, *x, x->precision(), layout, qnt); + } + + // pool mode + rk::nn::PoolType mode = rk::nn::PoolType::POOLING_UNKNOWN; + if (pooling_type == "max") { + mode = rk::nn::PoolType::POOLING_MAX; + } else if (pooling_type == "avg") { + mode = rk::nn::PoolType::POOLING_AVG; + } else { + LOG(WARNING) << "[RKNPU] Unsupported pooling type: " << pooling_type; + return FAILED; + } + + // pad mode + rk::nn::PadType pad_mode = rk::nn::PadType::AUTO; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = rk::nn::PadType::SAME; + } else if (padding_algorithm == "VALID") { + pad_mode = rk::nn::PadType::VALID; + } + + // paddings and strides + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[NPU] Paddings size should be the same or twice as the inputs size."; + + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + auto strides = op_info->GetAttr>("strides"); + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + // ceil mode + int ceil_mode = 0; + if (op_info->HasAttr("ceil_mode")) { + ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; + } + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + + output_node = graph->Add(out_name, *output, precision, layout, output_qnt); + + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(x_node->data()); + outputs.push_back(output_node->data()); + + rk::nn::PoolAttr attrs; + attrs.ksize[0] = ksize[0]; + attrs.ksize[1] = ksize[1]; + attrs.stride[0] = strides[0]; + attrs.stride[1] = strides[1]; + attrs.pad[0] = paddings[0]; + attrs.pad[1] = paddings[1]; + attrs.pad[2] = paddings[2]; + attrs.pad[3] = paddings[3]; + attrs.pad_type = pad_mode; + attrs.pool_type = mode; + attrs.global_pooling = global_pooling; + + if (ceil_mode) { + attrs.round_type = rk::nn::RoundType::ROUND_CEIL; + } else { + attrs.round_type = rk::nn::RoundType::ROUND_FLOOR; + } + + auto rGraph = graph->GetHandle(); + auto pool = + rGraph->AddOperator(rk::nn::OperatorType::POOL, inputs, outputs, &attrs); + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kRKNPU, + paddle::lite::subgraph::rknpu::PoolConverter); diff --git a/lite/kernels/rknpu/bridges/softmax_op.cc b/lite/kernels/rknpu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ec0b9c7462526f0409a634159d17d5afbd795f5 --- /dev/null +++ b/lite/kernels/rknpu/bridges/softmax_op.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
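+// Bridge that converts the Paddle softmax op into an rk::nn SOFTMAX operator
+// (axis taken from the op attribute, beta fixed to 1.0).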
+ +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[RKNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x_type = kernel->GetInputDeclType("X"); + CHECK(x_type->layout() == DATALAYOUT(kNCHW)); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto x_rank = x_dims.size(); + auto out_name = op_info->Output("Out").front(); + auto out_type = kernel->GetOutputDeclType("Out"); + CHECK(out_type->layout() == DATALAYOUT(kNCHW)); + auto output = scope->FindMutableTensor(out_name); + auto axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + int bit_length = 8; + DataLayoutType layout = DATALAYOUT(kNCHW); + PrecisionType precision = PRECISION(kFloat); + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + bit_length = op_info->GetAttr("bit_length"); + output_scale = op_info->GetAttr("output_scale"); + + if (enable_int8) { + precision = PRECISION(kInt8); + } + } + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + QuantizationInfo qnt; + qnt.enable_int8 = enable_int8; + + if (enable_int8) { + qnt.scale.push_back(input_scale); + qnt.quant_bits = bit_length; + } + x_node = graph->Add(x_name, *x, precision, layout, qnt); + } + + std::shared_ptr output_node = nullptr; + QuantizationInfo output_qnt; + + output_qnt.enable_int8 = enable_int8; + + if (enable_int8) { + output_qnt.quant_bits = bit_length; + output_qnt.scale.push_back(output_scale); + output->mutable_data(); + } + + output_node = graph->Add(out_name, *output, precision, layout, output_qnt); + + std::vector> inputs; + std::vector> outputs; + + inputs.push_back(x_node->data()); + outputs.push_back(output_node->data()); + + rk::nn::SoftmaxAttr attrs; + attrs.axis = axis; + attrs.beta = 1.0; + + auto rGraph = graph->GetHandle(); + auto softmax = rGraph->AddOperator( + rk::nn::OperatorType::SOFTMAX, inputs, outputs, &attrs); + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kRKNPU, + paddle::lite::subgraph::rknpu::SoftmaxConverter); diff --git a/lite/kernels/rknpu/bridges/utility.cc b/lite/kernels/rknpu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..df236951ff1c4ede5fed11286fa7547903611fb4 --- /dev/null +++ b/lite/kernels/rknpu/bridges/utility.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/rknpu/bridges/utility.h" +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "rknpu/rknpu_pub.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision) { + rk::nn::PrecisionType t = rk::nn::PrecisionType::UNKNOWN; + + switch (precision) { + case PrecisionType::kFloat: + t = rk::nn::PrecisionType::FLOAT32; + break; + case PrecisionType::kFP16: + t = rk::nn::PrecisionType::FLOAT16; + break; + case PrecisionType::kInt16: + t = rk::nn::PrecisionType::INT16; + break; + case PrecisionType::kInt32: + t = rk::nn::PrecisionType::INT32; + break; + case PrecisionType::kInt64: + t = rk::nn::PrecisionType::INT64; + break; + case PrecisionType::kInt8: + t = rk::nn::PrecisionType::INT8; + break; + case PrecisionType::kBool: + t = rk::nn::PrecisionType::BOOL8; + break; + default: + break; + } + + return t; +} + +rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout) { + rk::nn::DataLayoutType t = rk::nn::DataLayoutType::UNKNOWN; + + switch (layout) { + case DataLayoutType::kNCHW: + t = rk::nn::DataLayoutType::NCHW; + break; + case DataLayoutType::kNHWC: + t = rk::nn::DataLayoutType::NHWC; + break; + default: + break; + } + + return t; +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/rknpu/bridges/utility.h b/lite/kernels/rknpu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..7e8e5b5c97cbb00e784b7cbecf25e7238d271520 --- /dev/null +++ b/lite/kernels/rknpu/bridges/utility.h @@ -0,0 +1,39 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
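+// Declarations of the helpers shared by the RKNPU bridges: Paddle-to-rk::nn
+// precision/layout conversion and optional-input lookup.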
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "rknpu/rknpu_pub.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace rknpu { + +rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision); +rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout); +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); +} // namespace rknpu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0b63205705609b6899918ce8e254ccdf6cbad47 --- /dev/null +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/rknpu/subgraph_compute.h" +#include +#include +#include +#include "lite/backends/rknpu/device.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/rknpu/bridges/graph.h" +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#include "lite/kernels/rknpu/bridges/utility.h" +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace kernels { +namespace rknpu { + +int SubgraphEngine::BuildDeviceProgram() { + LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; + int status = 0; + // Convert all of ops and their input vars and weights and added into the NPU + // RKNPU IR graph + subgraph::rknpu::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kRKNPU))) { + return subgraph::FAILED; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kRKNPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + // Collect the valid input and output nodes in the RKNPU IR graph and update + // the input and output names + device_inames_.clear(); + device_onames_.clear(); + + for (auto& input_name : input_names_) { + LOG(INFO) << "[RKNPU] Input node " << input_name; + if (graph.Has(input_name)) { + LOG(INFO) << input_name << " Precision " + << PrecisionToStr(graph.Get(input_name)->precision()); + device_itensors_.push_back(graph.Get(input_name)->data()); + device_inames_.push_back(input_name); + } else { + LOG(WARNING) << "[RKNPU] Input node " << input_name + << " is ignored because it does not exist."; + } + } + + for (auto& output_name : output_names_) { + LOG(INFO) << "[RKNPU] Output node " << output_name; + if (graph.Has(output_name)) { + auto tensor = scope_->FindMutableTensor(output_name); + LOG(INFO) << output_name << " Precision " + << 
PrecisionToStr(tensor->precision()); + device_otensors_.push_back(graph.Get(output_name)->data()); + device_onames_.push_back(output_name); + } else { + LOG(WARNING) << "[RKNPU] Output node " << output_name + << " is ignored because it does not exist."; + } + } + CHECK(!device_inames_.empty()) + << "[RKNPU] No input nodes found for building NPU model"; + CHECK(!device_onames_.empty()) + << "[RKNPU] No output nodes found for building NPU model"; + + device_program_ = lite::rknpu::Device::Global().Build( + model_name_, graph.GetHandle(), device_itensors_, device_otensors_); + if (device_program_ == nullptr) { + LOG(WARNING) << "[RKNPU] Build model failed!"; + return subgraph::FAILED; + } + + // input + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + for (size_t i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + } + // output + origin_odims_.resize(output_names_.size()); + origin_otensors_.resize(output_names_.size()); + for (size_t i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + + auto output_dims = origin_otensors_[i]->dims(); + } + + origin_idims_.resize(device_inames_.size()); + origin_itensors_.resize(device_inames_.size()); + device_itensors_.resize(device_inames_.size()); + origin_odims_.resize(device_onames_.size()); + origin_otensors_.resize(device_onames_.size()); + device_otensors_.resize(device_onames_.size()); + for (int i = 0; i < device_inames_.size(); i++) { + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); + origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + + LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout); + } + for (int i = 0; i < device_onames_.size(); i++) { + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); + origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout); + // Prepare the device output tensors + switch (precision) { + case PRECISION(kFloat): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt8): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt16): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt32): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt64): + origin_otensors_[i]->mutable_data(); + break; + default: + LOG(FATAL) << "[RKNPU] " << device_onames_[i] + << " can't mutable data with precision type " + << PrecisionToStr(precision); + break; + } + } + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + LOG(INFO) << "[RKNPU]:LaunchDeviceProgram"; + std::vector inputs; + std::vector outputs; + + inputs.resize(device_itensors_.size()); + for (size_t i = 0; i < device_itensors_.size(); i++) { + inputs[i].index = i; + inputs[i].buf = 
const_cast(origin_itensors_[i]->raw_data()); + inputs[i].size = origin_itensors_[i]->memory_size(); + inputs[i].pass_through = false; + inputs[i].type = + subgraph::rknpu::ToRknpuPrecisionType(origin_itensors_[i]->precision()); + inputs[i].layout = rk::nn::DataLayoutType::NCHW; + } + + outputs.resize(device_otensors_.size()); + for (size_t i = 0; i < device_otensors_.size(); i++) { + outputs[i].index = i; + outputs[i].buf = const_cast(origin_otensors_[i]->raw_data()); + outputs[i].size = origin_otensors_[i]->memory_size(); + outputs[i].want_float = false; + } + + device_program_->SetInputs(inputs); + device_program_->Run(); + device_program_->GetOutputs(outputs); + return 0; +} + +void SubgraphCompute::PrepareForRun() { + LOG(INFO) << "[RKNPU]:PrepareForRun"; + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, + param.sub_block_desc, + param.input_data_names, + param.output_data_names, + param.scope)); + CHECK(engine_); + engine_->Build(); +} + +void SubgraphCompute::Run() { + LOG(INFO) << "[RKNPU]:Run"; + CHECK(engine_); + engine_->Launch(); +} + +} // namespace rknpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kRKNPU, + kInt8, + kNCHW, + paddle::lite::kernels::rknpu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..863e6aef39ad54f0e9d94d4b507c6fca4128ebb8 --- /dev/null +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
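+// SubgraphEngine builds an rk::nn::Graph for a fused subgraph and launches it on
+// the device; SubgraphCompute is the kRKNPU/kInt8 kernel that wraps the engine.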
+ +#pragma once + +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" +#include "lite/core/types.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "rknpu/rknpu_pub.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace rknpu { + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + Scope *scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} + + protected: + int BuildDeviceProgram() override; + int LaunchDeviceProgram() override; + + std::string model_name_; + std::vector device_inames_; + std::vector device_onames_; + std::vector> device_itensors_; + std::vector> device_otensors_; + std::unique_ptr device_program_{nullptr}; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace rknpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt index c31e3ba58fc793aa92a5b37a59ad612e03c61a53..2ef2ea8232cba4b87032c6b28272c6aa598fe4b5 100644 --- a/lite/tests/api/CMakeLists.txt +++ b/lite/tests/api/CMakeLists.txt @@ -12,3 +12,10 @@ if(LITE_WITH_XPU) ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) endif() + +if(LITE_WITH_RKNPU) + lite_cc_test(test_mobilenetv1_int8_rknpu SRCS test_mobilenetv1_int8_rknpu.cc + DEPS ${lite_model_test_DEPS} paddle_api_full + RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL) +endif() diff --git a/lite/tests/api/test_mobilenetv1_int8_rknpu.cc b/lite/tests/api/test_mobilenetv1_int8_rknpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c123088b3f69560abf3555dd2e459af926426ef --- /dev/null +++ b/lite/tests/api/test_mobilenetv1_int8_rknpu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
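+// End-to-end test: runs a fully quantized MobileNetV1 through the kRKNPU subgraph
+// kernel and reports the average latency and sampled output values.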
+ +#include +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +inline int64_t ShapeProduction(std::vector shape) { + int64_t s = 1; + for (int64_t dim : shape) { + s *= dim; + } + return s; +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] + << " model_dir [thread_num] [warmup_times] [repeat_times] " + "[input_data_path] [output_data_path]" + << std::endl; + return -1; + } + std::string model_dir = argv[1]; + int thread_num = 1; + if (argc > 2) { + thread_num = atoi(argv[2]); + } + int warmup_times = 5; + if (argc > 3) { + warmup_times = atoi(argv[3]); + } + int repeat_times = 10; + if (argc > 4) { + repeat_times = atoi(argv[4]); + } + std::string input_data_path; + if (argc > 5) { + input_data_path = argv[5]; + } + std::string output_data_path; + if (argc > 6) { + output_data_path = argv[6]; + } + paddle::lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_threads(thread_num); + config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH); + config.set_valid_places( + {paddle::lite_api::Place{ + TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + paddle::lite_api::Place{ + TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)}, + paddle::lite_api::Place{ + TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)}, + paddle::lite_api::Place{ + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}}); + auto predictor = paddle::lite_api::CreatePaddlePredictor(config); + + std::unique_ptr input_tensor( + std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 3, 224, 224}); + auto input_data = input_tensor->mutable_data(); + auto input_size = ShapeProduction(input_tensor->shape()); + if (input_data_path.empty()) { + for (int i = 0; i < input_size; i++) { + input_data[i] = 1; + } + } else { + std::fstream fs(input_data_path, std::ios::in); + if (!fs.is_open()) { + std::cerr << "open input data file failed." << std::endl; + return -1; + } + for (int i = 0; i < input_size; i++) { + fs >> input_data[i]; + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < repeat_times; ++i) { + predictor->Run(); + } + + std::cout << "Model: " << model_dir << ", threads num " << thread_num + << ", warmup times: " << warmup_times + << ", repeat times: " << repeat_times << ", spend " + << (GetCurrentUS() - start) / repeat_times / 1000.0 + << " ms in average." 
<< std::endl; + + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto output_data = output_tensor->data(); + auto output_size = ShapeProduction(output_tensor->shape()); + std::cout << "output data:"; + for (int i = 0; i < output_size; i += 100) { + std::cout << "[" << i << "] " << output_data[i] << std::endl; + } + return 0; +} diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index cb454c4da5bc15d65e480f55dabe01124bf18ca5..9411942f504c6c95d15b2a9af638b24cd85e3552 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tools/build.sh b/lite/tools/build.sh index fb540d3cd2bae881cfc31cf13f4afa017bf88ebe..d52680c13e4fe6f724456e090587fb85dc53a25e 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -27,6 +27,8 @@ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.hua BUILD_XPU=OFF BUILD_XTCL=OFF XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/" +BUILD_RKNPU=OFF +RKNPU_DDK_ROOT="$(pwd)/rknpu/" LITE_WITH_ARM_LANG=OFF readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -141,6 +143,8 @@ function make_tiny_publish_so { -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_RKNPU=$BUILD_RKNPU \ + -DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} make publish_inference -j$NUM_PROC @@ -230,6 +234,8 @@ function make_full_publish_so { -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_RKNPU=$BUILD_RKNPU \ + -DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \ -DLITE_WITH_TRAIN=$BUILD_TRAIN \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -265,6 +271,8 @@ function make_all_tests { -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_RKNPU=$BUILD_RKNPU \ + -DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} make lite_compile_deps -j$NUM_PROC @@ -498,6 +506,14 @@ function main { XPU_SDK_ROOT="${i#*=}" shift ;; + --build_rknpu=*) + BUILD_RKNPU="${i#*=}" + shift + ;; + --rknpu_ddk_root=*) + RKNPU_DDK_ROOT="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_rknpu.sh b/lite/tools/build_rknpu.sh new file mode 100755 index 0000000000000000000000000000000000000000..aa2fb5a124077b43f65537ab12715602ab1fe6b8 --- /dev/null +++ b/lite/tools/build_rknpu.sh @@ -0,0 +1,162 @@ +#!/bin/bash +set -ex 
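+# Builds Paddle Lite with LITE_WITH_RKNPU=ON against the Rockchip NPU DDK
+# pointed to by --ddk_root (defaults to ./rknpu).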
+ +# global variables with default value +ARM_OS="armlinux" # android only yet +ARM_ABI="armv8" # armv8, armv7 +ARM_LANG="gcc" # gcc only yet +DDK_ROOT="$(pwd)/rknpu" +TARGET_NAME="test_subgraph_pass" # default target +BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +WITH_TESTING=ON # ON/OFF +SHUTDOWN_LOG=OFF # ON(disable logging)/OFF +ON_TINY_PUBLISH=OFF # ON(tiny publish)/OFF(full publish) + +function print_usage { + echo -e "\nUSAGE:" + echo + echo "----------------------------------------" + echo -e "--arm_os= android only yet." + echo -e "--arm_abi= armv8, armv7 yet." + echo -e "--arm_lang=" + echo -e "--ddk_root=" + echo -e "--target_name=" + echo "----------------------------------------" + echo +} + +# for code gen, a source file is generated after a test, +# but is depended on by some targets in cmake. +# here we fake an empty file to make cmake work. +function prepare_workspace { + # in build directory + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2. Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ +} + +function prepare_thirdparty { + readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz + + readonly workspace=$PWD + if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + rm -rf $workspace/third-party + + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xzf third-party-05b862.tar.gz + else + git submodule update --init --recursive + fi +} + +function build_npu { + cur_dir=$(pwd) + + prepare_thirdparty + + local publish_dir + if [[ "${ON_TINY_PUBLISH}" == "ON" ]]; then + WITH_TESTING=OFF + SHUTDOWN_LOG=ON + publish_dir="tiny_publish" + else + publish_dir="full_publish" + fi + build_dir=$cur_dir/build.lite.rknpu.${ARM_OS}.${ARM_ABI}.${ARM_LANG}.${publish_dir} + mkdir -p $build_dir + cd $build_dir + + # NPU libs need API LEVEL 24 above + prepare_workspace + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_NPU=OFF \ + -DLITE_WITH_JAVA=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_ARM_DOTPROD=ON \ + -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ + -DWITH_TESTING=${WITH_TESTING} \ + -DLITE_SHUTDOWN_LOG=${SHUTDOWN_LOG} \ + -DLITE_ON_TINY_PUBLISH=${ON_TINY_PUBLISH} \ + -DARM_TARGET_OS=${ARM_OS} \ + -DARM_TARGET_ARCH_ABI=${ARM_ABI} \ + -DARM_TARGET_LANG=${ARM_LANG} \ + -DLITE_WITH_RKNPU=ON \ + -DRKNPU_DDK_ROOT=${DDK_ROOT} + + make $TARGET_NAME -j2 + + cd - + echo "Done" +} + +function main { + # Parse command line. 
+ for i in "$@"; do + case $i in + --target_name=*) + TARGET_NAME="${i#*=}" + shift + ;; + --arm_os=*) + ARM_OS="${i#*=}" + shift + ;; + --arm_abi=*) + ARM_ABI="${i#*=}" + shift + ;; + --arm_lang=*) + ARM_LANG="${i#*=}" + shift + ;; + --android_stl=*) + ANDROID_STL="${i#*=}" + shift + ;; + --build_extra=*) + BUILD_EXTRA="${i#*=}" + shift + ;; + --ddk_root=*) + DDK_ROOT="${i#*=}" + shift + ;; + build) + build_npu + shift + ;; + full_publish) + TARGET_NAME=publish_inference + build_npu + shift + ;; + tiny_publish) + ON_TINY_PUBLISH=ON + TARGET_NAME=publish_inference + build_npu + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@ diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index 560174bc632bec89b9655ff89fd5eeb9e7db7786..a89e99bb70a9853eeb103d077173fefbb9a9a399 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,8 +56,8 @@ const std::vector> supported_ops_target = { ops_lines = [] # valid targets and valid_ops -valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] -valid_ops = [[], [], [], [], [], [], [], [], [], []] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 @@ -68,6 +68,9 @@ class TargetType: kFPGA = 7 kNPU = 8 kXPU = 9 + kBM = 10 + kMLU = 11 + kRKNPU = 12 kAny = 6 # any target # record op_info of valid kernels into `valid_ops` according to different target type
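
As a usage sketch (the DDK path is a placeholder, not part of this patch), the new script can be invoked as:

  ./lite/tools/build_rknpu.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --ddk_root=/path/to/rknpu_ddk full_publish

This configures CMake with -DLITE_WITH_RKNPU=ON and -DRKNPU_DDK_ROOT pointing at the DDK, then builds the publish_inference target.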