diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28613647b32c44c472917b10cdcab7acab843d1..3127e799f01ca16fd5ceea5838723f8ae195f7c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,6 +87,7 @@ lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
 lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
 lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
 lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
+lite_option(LITE_WITH_NNA "Enable Imagination NNA in lite mode" OFF)
 lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
 lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
 lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
@@ -171,6 +172,10 @@ if(LITE_WITH_RKNPU)
   include(device/rknpu)
 endif()
 
+if(LITE_WITH_NNA)
+  include(device/nna)
+endif()
+
 include(external/flatbuffers)
 
 # for mobile
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e980922d5b4869ede65e57e750b5b85676ed0dde..e1580d089c29674497cdd818018a2ab40ea89887 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -175,6 +175,10 @@ if (LITE_WITH_MLU)
   add_definitions("-DLITE_WITH_MLU")
 endif()
 
+if (LITE_WITH_NNA)
+  add_definitions("-DLITE_WITH_NNA")
+endif()
+
 if (LITE_WITH_HUAWEI_ASCEND_NPU)
   add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
 endif()
diff --git a/cmake/device/nna.cmake b/cmake/device/nna.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..127c11cd729dfec8c0f6027dfa5510a76ca19a6b
--- /dev/null
+++ b/cmake/device/nna.cmake
@@ -0,0 +1,69 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
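+
+# Typical configuration (the DDK path below is an illustrative example, not
+# a default shipped with this patch):
+#   cmake -DLITE_WITH_NNA=ON -DIMGNNA_DDK_ROOT=/path/to/imgnna-ddk ..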
+
+if(NOT LITE_WITH_NNA)
+  return()
+endif()
+
+if(NOT DEFINED IMGNNA_DDK_ROOT)
+  set(IMGNNA_DDK_ROOT $ENV{IMGNNA_DDK_ROOT})
+  if(NOT IMGNNA_DDK_ROOT)
+    message(FATAL_ERROR "Must set IMGNNA_DDK_ROOT or env IMGNNA_DDK_ROOT when LITE_WITH_NNA=ON")
+  endif()
+endif()
+
+message(STATUS "IMGNNA_DDK_ROOT: ${IMGNNA_DDK_ROOT}")
+find_path(IMGNNA_DDK_INC NAMES imgdnn.h
+  PATHS ${IMGNNA_DDK_ROOT}/include/imgdnn NO_DEFAULT_PATH)
+if(NOT IMGNNA_DDK_INC)
+  message(FATAL_ERROR "Cannot find imgdnn.h in ${IMGNNA_DDK_ROOT}/include/imgdnn")
+endif()
+
+include_directories(${IMGNNA_DDK_INC})
+
+#set(IMGNNA_SUB_LIB_PATH "lib64")
+#if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+#  set(IMGNNA_SUB_LIB_PATH "lib64")
+#endif()
+#if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+#  set(IMGNNA_SUB_LIB_PATH "lib")
+#endif()
+set(IMGNNA_LIB_PATH "lib")
+
+find_library(IMGNNA_DDK_IMGDNN_FILE NAMES imgdnn
+  PATHS ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH})
+
+if(NOT IMGNNA_DDK_IMGDNN_FILE)
+  message(FATAL_ERROR "Cannot find the imgdnn library in ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH}")
+else()
+  message(STATUS "Found IMGNNA_DDK IMGDNN Library: ${IMGNNA_DDK_IMGDNN_FILE}")
+  add_library(nna_ddk_imgdnn SHARED IMPORTED GLOBAL)
+  set_property(TARGET nna_ddk_imgdnn PROPERTY IMPORTED_LOCATION ${IMGNNA_DDK_IMGDNN_FILE})
+endif()
+
+find_library(IMGNNA_DDK_RUNTIME_FILE NAMES nnasession
+  PATHS ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH})
+
+if(NOT IMGNNA_DDK_RUNTIME_FILE)
+  message(FATAL_ERROR "Cannot find the nnasession library in ${IMGNNA_DDK_ROOT}/${IMGNNA_LIB_PATH}")
+else()
+  message(STATUS "Found IMGNNA_DDK RUNTIME Library: ${IMGNNA_DDK_RUNTIME_FILE}")
+  add_library(nna_ddk_runtime SHARED IMPORTED GLOBAL)
+  set_property(TARGET nna_ddk_runtime PROPERTY IMPORTED_LOCATION ${IMGNNA_DDK_RUNTIME_FILE})
+endif()
+
+set(nna_runtime_libs nna_ddk_runtime CACHE INTERNAL "imgnna ddk runtime libs")
+set(nna_builder_libs nna_ddk_imgdnn CACHE INTERNAL "imgnna ddk builder libs")
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index fe66d0f643e9bdf0cb778c4e4647294f553c023e..590243dc769fcaebb236ce57a88e037ac4f43690 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -118,6 +118,12 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()
 
+  if (LITE_WITH_NNA)
+    foreach(var ${lite_deps_NNA_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
   if (LITE_WITH_HUAWEI_ASCEND_NPU)
     foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
       set(deps ${deps} ${var})
@@ -149,7 +155,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
         HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -160,6 +166,7 @@ function(lite_cc_library TARGET)
             CUDA_DEPS ${args_CUDA_DEPS}
             CL_DEPS ${args_CL_DEPS}
             BM_DEPS ${args_BM_DEPS}
+            NNA_DEPS ${args_NNA_DEPS}
             RKNPU_DEPS ${args_RKNPU_DEPS}
             ARM_DEPS ${args_ARM_DEPS}
             CV_DEPS ${args_CV_DEPS}
@@ -200,7 +207,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -217,6 +224,7 @@ function(lite_cc_binary TARGET)
             XPU_DEPS ${args_XPU_DEPS}
             RKNPU_DEPS ${args_RKNPU_DEPS}
             BM_DEPS ${args_BM_DEPS}
+            NNA_DEPS ${args_NNA_DEPS}
             PROFILE_DEPS ${args_PROFILE_DEPS}
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
@@ -254,7 +262,7 @@ function(lite_cc_test TARGET)
   endif()
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS
         COMPILE_LEVEL # (basic|extra)
         )
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -279,6 +287,7 @@ function(lite_cc_test TARGET)
             XPU_DEPS ${args_XPU_DEPS}
             RKNPU_DEPS ${args_RKNPU_DEPS}
             BM_DEPS ${args_BM_DEPS}
+            NNA_DEPS ${args_NNA_DEPS}
             PROFILE_DEPS ${args_PROFILE_DEPS}
             LIGHT_DEPS ${args_LIGHT_DEPS}
             HVY_DEPS ${args_HVY_DEPS}
@@ -315,6 +324,7 @@ set(xpu_kernels CACHE INTERNAL "xpu kernels")
 set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
+set(nna_kernels CACHE INTERNAL "nna kernels")
 set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")
@@ -331,12 +341,12 @@ if(LITE_BUILD_TAILOR)
   file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
+# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU, NNA)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -448,6 +458,16 @@ function(add_kernel TARGET device level)
     endif()
     set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
+  if ("${device}" STREQUAL "NNA")
+    if (NOT LITE_WITH_NNA)
+      foreach(src ${args_SRCS})
+        file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+      endforeach()
+      return()
+    endif()
+    set(nna_kernels "${nna_kernels};${TARGET}" CACHE INTERNAL "")
+  endif()
+
   if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
     if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
       foreach(src ${args_SRCS})
@@ -500,6 +520,7 @@ function(add_kernel TARGET device level)
       RKNPU_DEPS ${args_RKNPU_DEPS}
       BM_DEPS ${args_BM_DEPS}
       MLU_DEPS ${args_MLU_DEPS}
+      NNA_DEPS ${args_NNA_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
@@ -519,7 +540,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NNA_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
         LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -557,6 +578,7 @@ function(add_operator TARGET level)
      RKNPU_DEPS ${args_RKNPU_DEPS}
      BM_DEPS ${args_BM_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
+     NNA_DEPS ${args_NNA_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index b4635a48d9c259b8897785092c7502e7fa40f90c..217bb15fffdd00d6dddb3a42e60e9a9a4cd686a7 100755
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -14,6 +14,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
 message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
+message(STATUS "LITE_WITH_NNA:\t${LITE_WITH_NNA}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
 
@@ -93,6 +94,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     if (LITE_WITH_RKNPU)
         set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
     endif(LITE_WITH_RKNPU)
+    if (LITE_WITH_NNA)
+        set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.nna")
+    endif(LITE_WITH_NNA)
 else()
     set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
 endif()
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index fb8784cb2084311a306d709006461ca349963bed..de464b4d45b3ed65bbe6c9426738d5be8f398d05 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -40,6 +40,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
             NPU_DEPS ${npu_kernels}
             APU_DEPS ${apu_kernels}
             RKNPU_DEPS ${rknpu_kernels}
+            NNA_DEPS ${nna_kernels}
             HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
             )
 
@@ -85,7 +86,10 @@ else()
             # Need to add RKNPU runtime libs dependency
             target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
         endif()
-
+        if (LITE_WITH_NNA)
+            # Need to add the IMG NNA runtime libs (libimgdnn.so, libnnasession.so) dependency
+            #target_link_libraries(paddle_light_api_shared ${nna_builder_libs} ${nna_runtime_libs})
+        endif()
     endif()
 endif()
 
@@ -118,6 +122,11 @@ if(LITE_WITH_RKNPU)
     set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
 endif()
 
+if(LITE_WITH_NNA)
+    set(light_api_deps ${light_api_deps} ${nna_deps})
+    set(cxx_api_deps ${cxx_api_deps} ${nna_deps})
+endif()
+
 if(LITE_WITH_HUAWEI_ASCEND_NPU)
     set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
     set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
@@ -137,6 +146,7 @@ list(LENGTH fpga_kernels num_fpga_kernels)
 list(LENGTH bm_kernels num_bm_kernels)
 list(LENGTH mlu_kernels num_mlu_kernels)
 list(LENGTH huawei_ascend_npu_kernels num_huawei_ascend_npu_kernels)
+list(LENGTH nna_kernels num_nna_kernels)
 
 message(STATUS "Collected ${num_ops} ops")
 message(STATUS "Collected ${num_x86_kernels} X86 kernels")
@@ -152,6 +162,7 @@ message(STATUS "Collected ${num_fpga_kernels} FPGA kernels")
FPGA kernels") message(STATUS "Collected ${num_bm_kernels} BM kernels") message(STATUS "Collected ${num_mlu_kernels} MLU kernels") message(STATUS "Collected ${num_huawei_ascend_npu_kernels} HUAWEI_ASCEND_NPU kernels") +message(STATUS "Collected ${num_imagination_nna_kernels} IMAGINATION_NNA kernels") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -169,6 +180,7 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} + NNA_DEPS ${nna_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -195,6 +207,7 @@ lite_cc_library(light_api SRCS light_api.cc CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + NNA_DEPS ${nna_kernels} MLU_DEPS ${mlu_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) @@ -219,6 +232,7 @@ if(WITH_TESTING) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + NNA_DEPS ${nna_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model @@ -351,6 +365,7 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + NNA_DEPS ${nna_kernels} BM_DEPS ${bm_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) # The final inference library for just MobileConfig. @@ -382,6 +397,7 @@ if(NOT WITH_COVERAGE) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + NNA_DEPS ${nna_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) endif() @@ -424,6 +440,7 @@ if(NOT WITH_COVERAGE) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + NNA_DEPS ${nna_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) @@ -444,6 +461,7 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} RKNPU_DEPS ${rknpu_kernels} + NNA_DEPS ${nna_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} @@ -460,6 +478,7 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} RKNPU_DEPS ${rknpu_kernels} + NNA_DEPS ${nna_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} @@ -476,6 +495,7 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} RKNPU_DEPS ${rknpu_kernels} + NNA_DEPS ${nna_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} @@ -486,6 +506,7 @@ if(NOT IOS) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + NNA_DEPS ${nna_kernels} XPU_DEPS ${xpu_kernels} RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} @@ -504,6 +525,7 @@ if(NOT IOS) APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} RKNPU_DEPS ${rknpu_kernels} + NNA_DEPS ${nna_kernels} MLU_DEPS ${mlu_kernels} HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} CL_DEPS ${opencl_kernels} @@ -518,6 +540,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} RKNPU_DEPS ${npu_kernels} + NNA_DEPS ${nna_kernels} XPU_DEPS ${xpu_kernels} APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} diff --git a/lite/api/opt.cc b/lite/api/opt.cc index e6a53e93e72261082fa220c5fe7b0c12bf60ca87..ef2bddc956af456fd6f638cb6b53351204cb33cc 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -125,6 +125,10 @@ std::vector ParserValidPlaces() { } else if (target_repr == "apu") { valid_places.emplace_back( Place{TARGET(kAPU), PRECISION(kInt8), 
+  } else if (target_repr == "nna") {
+    valid_places.emplace_back(TARGET(kNNA));
+    valid_places.emplace_back(
+        Place{TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
@@ -204,6 +208,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
                                         "kRKNPU",
                                         "kAPU",
                                         "kHuaweiAscendNPU",
+                                        "kNNA",
                                         "kAny",
                                         "kUnk"};
   int maximum_optype_length = 0;
@@ -269,16 +274,19 @@ void PrintHelpInfo() {
       "        `--optimize_out_type=(protobuf|naive_buffer)`\n"
       "        `--optimize_out=<output_optimize_model_dir>`\n"
       "        "
-      "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
+      "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|"
+      "nna)`\n"
       "        `--record_tailoring_info=(true|false)`\n"
       "  Arguments of model checking and ops information:\n"
       "        `--print_all_ops=true`   Display all the valid operators of "
       "Paddle-Lite\n"
       "        `--print_supported_ops=true "
-      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
+      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|nna)"
+      "`"
       "  Display valid operators of input targets\n"
       "        `--print_model_ops=true  --model_dir=<model_param_dir> "
-      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
+      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|nna)"
+      "`"
       "  Display operators in the input model\n";
   std::cout << "opt version:" << opt_version << std::endl
             << help_info << std::endl;
diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc
index ed41a821c0938b599dc8900baa021491df78f329..47d6682bdae7eb34765983f3f4f247f676116723 100644
--- a/lite/api/opt_base.cc
+++ b/lite/api/opt_base.cc
@@ -84,6 +84,10 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
   } else if (target_repr == "apu") {
     valid_places_.emplace_back(
         Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
+  } else if (target_repr == "nna") {
+    valid_places_.emplace_back(TARGET(kNNA));
+    valid_places_.emplace_back(
+        Place{TARGET(kNNA), PRECISION(kInt8), DATALAYOUT(kNCHW)});
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
@@ -240,7 +244,8 @@ void OptBase::PrintHelpInfo() {
       "default\n"
       "        `set_lite_out(output_optimize_model_dir)`\n"
       "        "
-      "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
+      "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu|"
+      "nna)`\n"
       "        `record_model_info(false|true)`: refer to whether to record ops "
       "info for stripping lib, false by default`\n"
       "        `run() : start model transformation`\n"
@@ -277,16 +282,17 @@ void OptBase::PrintExecutableBinHelpInfo() {
       "        `--param_file=<param_path>`\n"
       "        `--optimize_out_type=(protobuf|naive_buffer)`\n"
       "        `--optimize_out=<output_optimize_model_dir>`\n"
-      "        `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
+      "        "
+      "`--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`\n"
       "        `--record_tailoring_info=(true|false)`\n"
       "  Arguments of model checking and ops information:\n"
       "        `--print_all_ops=true`   Display all the valid operators of "
       "Paddle-Lite\n"
       "        `--print_supported_ops=true "
-      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
+      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`"
       "  Display valid operators of input targets\n"
       "        `--print_model_ops=true  --model_dir=<model_param_dir> "
-      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
+      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu|nna)`"
       "  Display operators in the input model\n";
   std::cout << "paddlelite opt version:" << opt_version << std::endl
             << help_info << std::endl;
@@ -305,6 +311,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
                                         "kRKNPU",
                                         "kAPU",
                                         "kHuaweiAscendNPU",
+                                        "kNNA",
                                         "kAny",
                                         "kUnk"};
   // Get the length of the first column: maximum length of the op_type
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index e70c09e9913aeba7a2c52eb77c9846f7413c0b44..177cc1f03a2d33f58c5ece4532b4dc02eab9c49a 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -81,7 +81,8 @@ const std::string& TargetToStr(TargetType target) {
                                            "mlu",
                                            "rknpu",
                                            "apu",
-                                           "huawei_ascend_npu"};
+                                           "huawei_ascend_npu",
+                                           "nna"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -125,7 +126,8 @@ const std::string& TargetRepr(TargetType target) {
                                            "kMLU",
                                            "kRKNPU",
                                            "kAPU",
-                                           "kHuaweiAscendNPU"};
+                                           "kHuaweiAscendNPU",
+                                           "kNNA"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -171,7 +173,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                      TARGET(kAPU),
                                      TARGET(kRKNPU),
                                      TARGET(kFPGA),
-                                     TARGET(kHuaweiAscendNPU)});
+                                     TARGET(kHuaweiAscendNPU),
+                                     TARGET(kNNA)});
   if (target == TARGET(kAny)) {
     return valid_set;
   }
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index 9ee4a5d2e7f45a410c553c9f1d441ceeef061951..6db085351716492f0b3ada0a77fc9b4988d090f8 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -58,7 +58,8 @@ enum class TargetType : int {
   kRKNPU = 12,
   kAPU = 13,
   kHuaweiAscendNPU = 14,
-  NUM = 15,  // number of fields.
+  kNNA = 15,
+  NUM = 16,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index a4ea030cbf3ae7ead5836f02638ff440335f89fe..619fd7c0ad366269d9905c34bf8dbc1fd702ac7b 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -53,6 +53,7 @@ USE_MIR_PASS(multi_stream_analysis_pass);
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
+USE_MIR_PASS(nna_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
 USE_MIR_PASS(mlu_subgraph_pass);
 USE_MIR_PASS(mlu_postprocess_pass);
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index e32b61094a0b9ce9781cb6e9b8aef7ab753d7278..3de7e7ec015411317c4c8d7b1fc7ea545b135043 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -39,17 +39,17 @@ namespace paddle {
 namespace lite {
 namespace pybind {
 
-using lite_api::Tensor;
+using lite::LightPredictorImpl;
 using lite_api::CxxConfig;
-using lite_api::MobileConfig;
-using lite_api::PowerMode;
-using lite_api::TargetType;
-using lite_api::PrecisionType;
 using lite_api::DataLayoutType;
-using lite_api::Place;
 using lite_api::MLUCoreVersion;
-using lite::LightPredictorImpl;
+using lite_api::MobileConfig;
 using lite_api::OptBase;
+using lite_api::Place;
+using lite_api::PowerMode;
+using lite_api::PrecisionType;
+using lite_api::TargetType;
+using lite_api::Tensor;
 
 #ifndef LITE_ON_TINY_PUBLISH
 using lite::CxxPaddleApiImpl;
@@ -192,6 +192,7 @@ void BindLitePlace(py::module *m) {
       .value("RKNPU", TargetType::kRKNPU)
       .value("APU", TargetType::kAPU)
       .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
+      .value("NNA", TargetType::kNNA)
       .value("Any", TargetType::kAny);
 
   // PrecisionType
diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt
index 27a8a46cfa1413ea0d9ffa3641d8e4bd60785e11..4b7192370a0fdef75bc74e931f5749357fa0418e 100644
--- a/lite/backends/CMakeLists.txt
+++ b/lite/backends/CMakeLists.txt
@@ -11,3 +11,4 @@ add_subdirectory(bm)
 add_subdirectory(apu)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
+add_subdirectory(nna)
diff --git a/lite/backends/nna/CMakeLists.txt b/lite/backends/nna/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..436db22d6a845486912dd143e1f49f6ca3c82ee0
--- /dev/null
+++ b/lite/backends/nna/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(NOT LITE_WITH_NNA)
+  return()
+endif()
+
+lite_cc_library(device_nna SRCS imgdnn_manager.cc DEPS ${nna_builder_libs} ${nna_runtime_libs})
diff --git a/lite/backends/nna/imgdnn_manager.cc b/lite/backends/nna/imgdnn_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72adb1b7ff3b9eac180db4c3c65180f893063042
--- /dev/null
+++ b/lite/backends/nna/imgdnn_manager.cc
@@ -0,0 +1,395 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "imgdnn_manager.h"  // NOLINT
+#include <cmath>  // sqrt in createBatchNormLayer
+
+namespace paddle {
+namespace lite {
+namespace nna {
+
+static void err_callback(imgdnn_report_flags flags,
+                         const char **tensor_names,
+                         int num_tensor_names,
+                         imgdnn_err_code error_code,
+                         const char *error_message) {
+  std::string msg_prefix;
+  switch (flags) {
+    case imgdnn_report_flags::IMGDNN_REPORT_ERROR:
+      msg_prefix = "ERROR";
+      break;
+    case imgdnn_report_flags::IMGDNN_REPORT_VERBOSE:
+      msg_prefix = "VERBOSE";
+      break;
+    case imgdnn_report_flags::IMGDNN_REPORT_INFO:
+      msg_prefix = "INFO";
+      break;
+    case imgdnn_report_flags::IMGDNN_REPORT_WARNING:
+      msg_prefix = "WARNING";
+      break;
+    default:
+      msg_prefix = "UNKNOWN";
+      std::cerr << "unknown report flag in error callback" << std::endl;
+  }
+
+  std::cerr << msg_prefix << ": " << error_message << std::endl;
+}
+
+ImgdnnManager::ImgdnnManager() {
+  err_ = imgdnnSetErrorHandler(err_callback);
+  net_ = imgdnnCreateNetwork(&err_);
+  ASSERT(err_ != IMGDNN_SUCCESS, "CreateNetwork failed!");
+
+  unsigned int num_devices;
+  err_ = imgdnnGetDevices(
+      IMGDNN_DEVICE_TYPE_ACCELERATOR, 1, &device_, &num_devices);
+  ASSERT(err_ != IMGDNN_SUCCESS, "GetDevices failed!");
+  context_ = imgdnnCreateContext(num_devices, &device_, 0, &err_);
+  ASSERT(err_ != IMGDNN_SUCCESS, "CreateContext failed!");
+  binding_ = imgdnnCreateBinding(&err_);
+  ASSERT(err_ != IMGDNN_SUCCESS, "CreateBinding failed!");
+}
+
+imgdnn_tensor ImgdnnManager::createConvolutionLayer(
+    imgdnn_tensor input_tensor,
+    imgdnn_tensor weights_tensor,
+    imgdnn_tensor bias_tensor,
+    imgdnn_quant_param dst_quant_param,
+    unsigned int stride[2],
+    unsigned int pad_begin[2],
+    unsigned int pad_end[2],
+    unsigned int dilation[2],
+    bool use_dwconv) {
+  imgdnn_tensor convw_tensor;
+  if (use_dwconv) {
+    // The depthwise path expects the weights with the first two
+    // dimensions swapped, so transpose them first.
+    int order[4] = {1, 0, 2, 3};
+    imgdnn_tensor transposed_weights =
+        imgdnnNetworkTransposeOp(net_, weights_tensor, order, &err_);
+    convw_tensor = imgdnnNetworkDepthConvolution2dOp_v2(net_,
+                                                        input_tensor,
+                                                        transposed_weights,
+                                                        stride,
+                                                        pad_begin,
+                                                        pad_end,
+                                                        dilation,
+                                                        &err_);
+  } else {
+    convw_tensor = imgdnnNetworkConvolution2dOp_v2(net_,
+                                                   input_tensor,
+                                                   weights_tensor,
+                                                   stride,
+                                                   pad_begin,
+                                                   pad_end,
+                                                   dilation,
+                                                   &err_);
+  }
+
+  imgdnn_tensor conv2d_tensor;
+  if (bias_tensor) {
+    // Accumulate in 32-bit, broadcast the per-channel bias across H and W,
+    // then add it to the convolution result.
+    imgdnn_tensor convw_int_tensor = imgdnnNetworkCastOp(
+        net_, convw_tensor, IMGDNN_TYPE_I32, nullptr, &err_);
+
+    imgdnn_tensor_descriptor bias_desc;
+    imgdnnGetTensorDescriptor(convw_tensor, &bias_desc);
+
+    imgdnn_tensor broadcast2_tensor;
+    broadcast2_tensor = imgdnnNetworkBroadcastOp(
+        net_, bias_tensor, 2, bias_desc.size[2], &err_);
+
+    imgdnn_tensor broadcast3_tensor;
+    broadcast3_tensor = imgdnnNetworkBroadcastOp(
+        net_, broadcast2_tensor, 3, bias_desc.size[3], &err_);
+
+    conv2d_tensor = imgdnnNetworkBinaryOp(
+        net_, convw_int_tensor, broadcast3_tensor, IMGDNN_OPERATION_ADD, &err_);
+  } else {
+    conv2d_tensor = convw_tensor;
+  }
+
+  // Cast back to the quantized input type; if the input is not quantized,
+  // return the accumulator tensor unchanged.
+  imgdnn_tensor conv2d_out_tensor = conv2d_tensor;
+  imgdnn_tensor_descriptor desc;
+  imgdnnGetTensorDescriptor(input_tensor, &desc);
+  if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
+    conv2d_out_tensor = imgdnnNetworkCastOp(
+        net_, conv2d_tensor, desc.type, &dst_quant_param, &err_);
+  }
+
+  return conv2d_out_tensor;
+}
+
+imgdnn_tensor ImgdnnManager::createBatchNormLayer(imgdnn_tensor input_tensor,
+                                                  const void *const avg_in,
+                                                  const void *const var_in,
+                                                  const float eps) {
+  imgdnn_tensor bna_tensor;
+  imgdnn_tensor average_tensor;
+  imgdnn_tensor_descriptor av_desc;
+
+  imgdnn_tensor broadcast2_tensor;
+  imgdnn_tensor broadcast3_tensor;
+
+  unsigned int buffer_size;
+
+  imgdnn_tensor_descriptor in_desc;
+  imgdnnGetTensorDescriptor(input_tensor, &in_desc);
+
+  av_desc.dimensions = 2;
+  av_desc.type = in_desc.type;
+  av_desc.size[0] = in_desc.size[0];
+  av_desc.size[1] = in_desc.size[1];
+
+  average_tensor = createFixedInputTensor(&av_desc, avg_in, true);
+
+  broadcast2_tensor =
+      imgdnnNetworkBroadcastOp(net_, average_tensor, 2, in_desc.size[2], &err_);
+
+  broadcast3_tensor = imgdnnNetworkBroadcastOp(
+      net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
+
+  bna_tensor = imgdnnNetworkBinaryOp(
+      net_, input_tensor, broadcast3_tensor, IMGDNN_OPERATION_SUB, &err_);
+
+  imgdnn_tensor variance_tensor;
+  imgdnn_tensor_descriptor va_desc;
+
+  va_desc.dimensions = 2;
+  va_desc.type = in_desc.type;
+  va_desc.size[0] = in_desc.size[0];
+  va_desc.size[1] = in_desc.size[1];
+
+  buffer_size = imgdnnGetDescriptorSize(&va_desc, &err_);
+  float *variance = reinterpret_cast<float *>(GetBufromPool(buffer_size));
+  memcpy(variance, var_in, buffer_size);
+  // Perform 1/sqrt(var + eps) in place and update the variance data.
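+  // This folds the batch-norm division into a single per-channel multiply:
+  // y = (x - mean) / sqrt(var + eps) becomes SUB followed by MUL. For
+  // example, var = 0.24 and eps = 0.01 store 1/sqrt(0.25) = 2.0.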
+  buffer_size /= sizeof(float);
+  for (size_t i = 0; i < buffer_size; i++) {
+    variance[i] = 1.0 / (sqrt(variance[i] + eps));
+  }
+  variance_tensor = createFixedInputTensor(&va_desc, variance, false);
+
+  broadcast2_tensor = imgdnnNetworkBroadcastOp(
+      net_, variance_tensor, 2, in_desc.size[2], &err_);
+
+  broadcast3_tensor = imgdnnNetworkBroadcastOp(
+      net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
+
+  imgdnn_tensor bn_tensor;
+  bn_tensor = imgdnnNetworkBinaryOp(
+      net_, bna_tensor, broadcast3_tensor, IMGDNN_OPERATION_MUL, &err_);
+
+  return bn_tensor;
+}
+
+imgdnn_tensor ImgdnnManager::createPoolingLayer(
+    imgdnn_tensor in_tensor,
+    imgdnn_quant_param dst_quant_param,
+    const unsigned int size[2],
+    const unsigned int stride[2],
+    const unsigned int pad_to_begin[2],
+    const unsigned int pad_to_end[2],
+    imgdnn_pooling_type type) {
+  imgdnn_tensor pool_tensor = imgdnnNetworkPooling2dOp_v2(
+      net_, in_tensor, size, stride, pad_to_begin, pad_to_end, type, &err_);
+
+  imgdnn_tensor_descriptor desc;
+  imgdnnGetTensorDescriptor(in_tensor, &desc);
+  if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
+    pool_tensor = imgdnnNetworkCastOp(
+        net_, pool_tensor, desc.type, &dst_quant_param, &err_);
+  }
+
+  return pool_tensor;
+}
+
+imgdnn_tensor ImgdnnManager::createFullyConnectedLayer(
+    imgdnn_tensor input_tensor,
+    imgdnn_tensor weights_tensor,
+    imgdnn_tensor bias_tensor,
+    imgdnn_quant_param dst_quant_param) {
+  imgdnn_tensor fcw_tensor;
+  imgdnn_tensor fcb_tensor;
+
+  imgdnn_tensor_descriptor in_desc;
+  imgdnnGetTensorDescriptor(input_tensor, &in_desc);
+
+  // Flatten every dimension after the batch dimension so the input is 2-D.
+  for (unsigned i = 2; i < in_desc.dimensions; ++i)
+    in_desc.size[1] *= in_desc.size[i];
+  in_desc.dimensions = 2;
+
+  auto reshaped_input =
+      imgdnnNetworkReshapeOp(net_, input_tensor, &in_desc, &err_);
+
+  // Weights are consumed as-is here; if a DDK build delivers them as
+  // [num_units, input_size] instead, transpose with order {1, 0} first.
+  fcw_tensor = imgdnnNetworkBinaryOp(
+      net_, reshaped_input, weights_tensor, IMGDNN_OPERATION_MATMUL, &err_);
+
+  if (bias_tensor) {
+    imgdnn_tensor fcw_int_tensor =
+        imgdnnNetworkCastOp(net_, fcw_tensor, IMGDNN_TYPE_I32, nullptr, &err_);
+
+    fcb_tensor = imgdnnNetworkBinaryOp(
+        net_, fcw_int_tensor, bias_tensor, IMGDNN_OPERATION_ADD, &err_);
+  } else {
+    fcb_tensor = fcw_tensor;
+  }
+
+  imgdnn_tensor_descriptor desc;
+  imgdnnGetTensorDescriptor(input_tensor, &desc);
+  if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
+    fcb_tensor = imgdnnNetworkCastOp(
+        net_, fcb_tensor, desc.type, &dst_quant_param, &err_);
+  }
+
+  return fcb_tensor;
+}
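+
+// Example of the flattening above: an NCHW input of [1, 256, 7, 7] is
+// reshaped to [1, 12544] before the MATMUL, so weights of shape
+// [12544, num_units] produce a [1, num_units] output.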
+
+imgdnn_tensor ImgdnnManager::createSoftmaxLayer(
+    imgdnn_tensor input_tensor,
+    float beta,
+    unsigned int axis,
+    imgdnn_quant_param dst_quant_param) {
+  imgdnn_tensor softmax_tensor =
+      imgdnnNetworkSoftmaxOp(net_, input_tensor, beta, axis, &err_);
+
+  imgdnn_tensor_descriptor desc;
+  imgdnnGetTensorDescriptor(input_tensor, &desc);
+  if (desc.type == IMGDNN_TYPE_Q_I8 || desc.type == IMGDNN_TYPE_Q_U8) {
+    softmax_tensor = imgdnnNetworkCastOp(
+        net_, softmax_tensor, desc.type, &dst_quant_param, &err_);
+  }
+
+  return softmax_tensor;
+}
+
+imgdnn_tensor ImgdnnManager::createScaleLayer(imgdnn_tensor input_tensor,
+                                              bool with_biasscale,
+                                              const void *const scale,
+                                              const void *const bias) {
+  imgdnn_tensor sc_tensor;
+  imgdnn_tensor scale_tensor;
+  imgdnn_tensor_descriptor sc_desc;
+
+  imgdnn_tensor broadcast2_tensor;
+  imgdnn_tensor broadcast3_tensor;
+
+  imgdnn_tensor_descriptor in_desc;
+  imgdnnGetTensorDescriptor(input_tensor, &in_desc);
+
+  sc_desc.dimensions = 2;
+  sc_desc.type = in_desc.type;
+  sc_desc.size[0] = in_desc.size[0];
+  sc_desc.size[1] = in_desc.size[1];
+
+  scale_tensor = createFixedInputTensor(&sc_desc, scale, true);
+
+  broadcast2_tensor =
+      imgdnnNetworkBroadcastOp(net_, scale_tensor, 2, in_desc.size[2], &err_);
+
+  broadcast3_tensor = imgdnnNetworkBroadcastOp(
+      net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
+
+  sc_tensor = imgdnnNetworkBinaryOp(
+      net_, input_tensor, broadcast3_tensor, IMGDNN_OPERATION_MUL, &err_);
+
+  if (with_biasscale) {
+    imgdnn_tensor bsc_tensor;
+    imgdnn_tensor biasscale_tensor;
+
+    biasscale_tensor = createFixedInputTensor(&sc_desc, bias, true);
+
+    broadcast2_tensor = imgdnnNetworkBroadcastOp(
+        net_, biasscale_tensor, 2, in_desc.size[2], &err_);
+
+    broadcast3_tensor = imgdnnNetworkBroadcastOp(
+        net_, broadcast2_tensor, 3, in_desc.size[3], &err_);
+
+    bsc_tensor = imgdnnNetworkBinaryOp(
+        net_, sc_tensor, broadcast3_tensor, IMGDNN_OPERATION_ADD, &err_);
+    return bsc_tensor;
+  } else {
+    return sc_tensor;
+  }
+}
+
+imgdnn_network_object ImgdnnManager::createNetworkObject(
+    unsigned int num_inputs,
+    imgdnn_tensor *inputs,
+    unsigned int num_outputs,
+    imgdnn_tensor *outputs) {
+  const imgdnn_network_object_flags flags = 0;
+
+  // Locate the hardware/mapping configs relative to IMGNNA_DDK_ROOT (the
+  // same variable the build requires) instead of a hard-coded developer
+  // home directory; the config file names assume the DDK layout above.
+  const char *env_root = std::getenv("IMGNNA_DDK_ROOT");
+  ASSERT(env_root == nullptr, "IMGNNA_DDK_ROOT must be set to locate configs");
+  std::string ddk_root{env_root};
+  std::string hwconfig =
+      ddk_root + "/nna-tools/config/mirage_hw_config06_23_2_6500_301.json";
+  std::string mapconfig = ddk_root + "/nna-tools/config/mapconfig_q8a.json";
+  std::string options_str;
+  options_str += "-h " + hwconfig;
+  options_str += " -m " + mapconfig;
+  // options_str += " --dump_debug_binaries enabled";
+
+  net_obj_ = imgdnnCreateNetworkObject(device_,
+                                       context_,
+                                       net_,
+                                       num_inputs,
+                                       inputs,
+                                       num_outputs,
+                                       outputs,
+                                       flags,
+                                       options_str.c_str(),
+                                       &err_);
+  ASSERT(err_ != IMGDNN_SUCCESS, "CreateNetworkObject failed!");
+  return net_obj_;
+}
+
+}  // namespace nna
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/nna/imgdnn_manager.h b/lite/backends/nna/imgdnn_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef2fb15d13a9d43d514764b271e34e04ba3270eb
--- /dev/null
+++ b/lite/backends/nna/imgdnn_manager.h
@@ -0,0 +1,257 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include "imgdnn.h"  // NOLINT
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace nna {
+
+static inline void CheckAndPrint(bool cond,
+                                 const char *msg,
+                                 int line,
+                                 const char *filename) {
+  if (cond) {
+    std::stringstream err_msg;
+    err_msg << "ERROR: " << msg << "\n";
+    err_msg << "Violated condition at line " << line << " in " << filename;
+    std::cerr << err_msg.str() << "\n";
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define ASSERT(statement, msg) \
+  lite::nna::CheckAndPrint(statement, msg, __LINE__, __FILE__)
+
+class ImgdnnManager {
+  imgdnn_err_code err_;
+  imgdnn_device device_;
+  imgdnn_network net_{nullptr};
+  imgdnn_context context_{nullptr};
+  imgdnn_binding binding_{nullptr};
+  imgdnn_network_object net_obj_{nullptr};
+
+  std::vector<uint8_t *> coef_pool;
+
+ public:
+  ImgdnnManager();
+
+  virtual ~ImgdnnManager() {
+    std::cout << "~ImgdnnManager called" << std::endl;
+    if (net_obj_) err_ = imgdnnNetworkObjectDestroy(net_obj_);
+    if (context_) err_ = imgdnnContextDestroy(context_);
+    if (binding_) err_ = imgdnnBindingDestroy(binding_);
+    if (net_) err_ = imgdnnNetworkDestroy(net_);
+
+    for (auto buf : coef_pool) delete[] buf;
+  }
+
+  uint8_t *GetBufromPool(size_t size) {
+    uint8_t *buf = new uint8_t[size];
+    coef_pool.push_back(buf);
+    return buf;
+  }
+
+  imgdnn_network GetNetwork() { return net_; }
+
+  imgdnn_tensor createInputTensor(imgdnn_tensor_descriptor *desc) {
+    return imgdnnNetworkInput(net_, desc, &err_);
+  }
+
+  imgdnn_tensor createFixedInputTensor(imgdnn_tensor_descriptor *desc,
+                                       const void *const fixed_data,
+                                       bool mem_copy) {
+    imgdnn_tensor fixed_input;
+    if (mem_copy) {
+      size_t buffer_size = imgdnnGetDescriptorSize(desc, &err_);
+      void *buf = GetBufromPool(buffer_size);
+      memcpy(buf, fixed_data, buffer_size);
+      fixed_input = imgdnnNetworkFixedInput(net_, desc, buf, &err_);
+    } else {
+      fixed_input = imgdnnNetworkFixedInput(net_, desc, fixed_data, &err_);
+    }
+    return fixed_input;
+  }
+
+  imgdnn_tensor createConvolutionLayer(imgdnn_tensor input_tensor,
+                                       imgdnn_tensor weights_tensor,
+                                       imgdnn_tensor bias_tensor,
+                                       imgdnn_quant_param dst_quant_param,
+                                       unsigned int stride[2],
+                                       unsigned int pad_begin[2],
+                                       unsigned int pad_end[2],
+                                       unsigned int dilation[2],
+                                       bool use_dwconv = false);
+  imgdnn_tensor createBatchNormLayer(imgdnn_tensor input_tensor,
+                                     const void *const avg_in,
+                                     const void *const var_in,
+                                     const float eps);
+  imgdnn_tensor createPoolingLayer(imgdnn_tensor in_tensor,
+                                   imgdnn_quant_param dst_quant_param,
+                                   const unsigned int size[2],
+                                   const unsigned int stride[2],
+                                   const unsigned int pad_to_begin[2],
+                                   const unsigned int pad_to_end[2],
+                                   imgdnn_pooling_type type);
+  imgdnn_tensor createFullyConnectedLayer(imgdnn_tensor input_tensor,
+                                          imgdnn_tensor weights_tensor,
+                                          imgdnn_tensor bias_tensor,
+                                          imgdnn_quant_param dst_quant_param);
+  imgdnn_tensor createSoftmaxLayer(imgdnn_tensor in_tensor,
+                                   float beta,
+                                   unsigned int axis,
+                                   imgdnn_quant_param dst_quant_param);
+  imgdnn_tensor createScaleLayer(imgdnn_tensor input_tensor,
+                                 bool with_biasscale,
+                                 const void *const scale,
+                                 const void *const bias);
+
+  imgdnn_tensor createReLULayer(imgdnn_tensor in_tensor,
+                                bool has_min_clamp,
+                                float min_clamp,
+                                bool has_max_clamp,
+                                float max_clamp,
+                                float negative_slope) {
+    imgdnn_tensor relu_tensor = imgdnnNetworkReLUOp(net_,
+                                                    in_tensor,
+                                                    has_min_clamp,
+                                                    min_clamp,
+                                                    has_max_clamp,
+                                                    max_clamp,
+                                                    negative_slope,
+                                                    &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "ReLU OP fails");
+
+    imgdnn_tensor_descriptor in_desc, relu_desc;
+    imgdnnGetTensorDescriptor(in_tensor, &in_desc);
+    imgdnnGetTensorDescriptor(relu_tensor, &relu_desc);
+    if (relu_desc.type != in_desc.type) {
+      relu_tensor = imgdnnNetworkCastOp(
+          net_, relu_tensor, in_desc.type, &in_desc.quant_param, &err_);
+      ASSERT(err_ != IMGDNN_SUCCESS, "ReLU cast fails");
+    }
+
+    return relu_tensor;
+  }
+
+  imgdnn_network_object createNetworkObject(unsigned int num_inputs,
+                                            imgdnn_tensor *inputs,
+                                            unsigned int num_outputs,
+                                            imgdnn_tensor *outputs);
+
+  imgdnn_memory importMemory(
+      void *memory,
+      size_t size,
+      imgdnn_import_mem_type import_mem_type = IMGDNN_IMPORT_MEM_TYPE_CPU) {
+    imgdnn_memory mem =
+        imgdnnImportMemory(context_, memory, size, import_mem_type, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "ImportMemory fails");
+    return mem;
+  }
+
+  imgdnn_memory allocateMemory(size_t size) {
+    imgdnn_memory mem = imgdnnAllocateMemory(context_, size, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "AllocateMemory fails");
+    return mem;
+  }
+
+  void destroyMemory(imgdnn_memory memory) {
+    err_ = imgdnnMemoryDestroy(memory);
+    ASSERT(err_ != IMGDNN_SUCCESS, "MemoryDestroy fails");
+  }
+
+  void *lockMemory(imgdnn_memory memory, imgdnn_lock_access lock_access) {
+    void *mem = imgdnnMemoryLock(memory, lock_access, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "MemoryLock fails");
+    return mem;
+  }
+
+  void unlockMemory(imgdnn_memory memory) {
+    err_ = imgdnnMemoryUnlock(memory);
+    ASSERT(err_ != IMGDNN_SUCCESS, "MemoryUnLock fails");
+  }
+
+  void getNetworkObjectInputs(unsigned int max_inputs,
+                              imgdnn_input inputs[],
+                              unsigned int *num_inputs) {
+    ASSERT(net_obj_ == nullptr, "NetworkObject NULL when get its inputs");
+    err_ =
+        imgdnnNetworkObjectGetInputs(net_obj_, max_inputs, inputs, num_inputs);
+    ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectGetInputs failed!");
+  }
+
+  void getNetworkObjectOutputs(unsigned int max_outputs,
+                               imgdnn_output outputs[],
+                               unsigned int *num_outputs) {
+    ASSERT(net_obj_ == nullptr, "NetworkObject NULL when get its outputs");
+    err_ = imgdnnNetworkObjectGetOutputs(
+        net_obj_, max_outputs, outputs, num_outputs);
+    ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectGetOutputs failed!");
+  }
+
+  imgdnn_tensor_descriptor getInputDescriptor(imgdnn_input input) {
+    imgdnn_tensor_descriptor desc = imgdnnGetInputDescriptor(input, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "GetInputDescriptors failed!");
+    return desc;
+  }
+
+  imgdnn_tensor_descriptor getOutputDescriptor(imgdnn_output output) {
+    imgdnn_tensor_descriptor desc = imgdnnGetOutputDescriptor(output, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "GetOutputDescriptors failed!");
+    return desc;
+  }
+
+  size_t getDescriptorSize(const imgdnn_tensor_descriptor *const descriptor) {
+    size_t size = imgdnnGetDescriptorSize(descriptor, &err_);
+    ASSERT(err_ != IMGDNN_SUCCESS, "GetDescriptorSize failed!");
+    return size;
+  }
+
+  void addBindingInput(imgdnn_input input, imgdnn_memory memory) {
+    err_ = imgdnnBindingAddInput(binding_, input, memory);
+    ASSERT(err_ != IMGDNN_SUCCESS, "BindingAddInput failed!");
+  }
+
+  void addBindingOutput(imgdnn_output output, imgdnn_memory memory) {
+    err_ = imgdnnBindingAddOutput(binding_, output, memory);
+    ASSERT(err_ != IMGDNN_SUCCESS, "BindingAddOutput failed!");
+  }
+
+  void executeNetworkObject(bool blocking_execute,
+                            unsigned int num_events_in_wait_list,
+                            const imgdnn_event event_wait_list[],
+                            imgdnn_event *event) {
+    err_ = imgdnnNetworkObjectExecute(net_obj_,
+                                      binding_,
+                                      blocking_execute,
+                                      num_events_in_wait_list,
+                                      event_wait_list,
+                                      event);
+    ASSERT(err_ != IMGDNN_SUCCESS, "NetworkObjectExecute failed!");
+  }
+};
+
+}  // namespace nna
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt
index 53988f063b89ae3e75f4c27cc1d937d12bb6dae5..b2360467e1b586689ca8a9d740da662314893d18 100644
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 
 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${nna_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/core/context.h b/lite/core/context.h
index 84742bf478c26e5609c507925c6d28805cb3a70c..1cfdf435a33d320300f461cd9ec8438c1cee195f 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -64,6 +64,7 @@ using BMContext = Context<TargetType::kBM>;
 using MLUContext = Context<TargetType::kMLU>;
 using RKNPUContext = Context<TargetType::kRKNPU>;
 using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
+using NNAContext = Context<TargetType::kNNA>;
 
 template <>
 class Context<TargetType::kHost> {
@@ -173,6 +174,21 @@ class Context<TargetType::kHuaweiAscendNPU> {
 };
 #endif
 
+#ifdef LITE_WITH_NNA
+template <>
+class Context<TargetType::kNNA> {
+ public:
+  Context() {}
+  explicit Context(const NNAContext& ctx);
+  // NOTE: InitOnce should only be used by ContextScheduler
+  void InitOnce() {}
+  void CopySharedTo(NNAContext* ctx) {}
+
+  // NNAContext& operator=(const NNAContext& ctx) {}
+  std::string name() const { return "NNAContext"; }
+};
+#endif
+
 #ifdef LITE_WITH_XPU
 template <>
 class Context<TargetType::kXPU> {
@@ -471,6 +487,12 @@ class ContextScheduler {
             &ctx->As<HuaweiAscendNPUContext>());
         break;
 #endif
+#ifdef LITE_WITH_NNA
+      case TARGET(kNNA):
+        kernel_contexts_[TargetType::kNNA].As<NNAContext>().CopySharedTo(
+            &ctx->As<NNAContext>());
+        break;
+#endif
 #ifdef LITE_WITH_MLU
       case TARGET(kMLU): {
         int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
@@ -533,6 +555,9 @@ class ContextScheduler {
 #endif
 #ifdef LITE_WITH_MLU
     InitContext<TargetType::kMLU, MLUContext>();
+#endif
+#ifdef LITE_WITH_NNA
+    InitContext<TargetType::kNNA, NNAContext>();
 #endif
   }
diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc
index 3817d0049c9e302b5b39aae6bca96dff2180bd73..a3791308811315c5c37a3c4cce6d269fd531efb2 100644
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
@@ -52,21 +52,21 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
                                                       "feed",
                                                       "fetch"};
 
-  auto insert_invalid_op_nodes_for_specific_target = [&](
-      std::set<std::string> op_node_set, TargetType specific_target) {
-    std::set<std::string> invalid_op_nodes_opencl = {"layout", "fc"};
-    for (auto& op_node : graph->StmtTopologicalOrder()) {
-      if (!op_node->IsStmt()) continue;
-      TargetType op_target_type = op_node->AsStmt().place().target;
-      if (op_target_type == specific_target &&
-          specific_target == TARGET(kOpenCL)) {
-        invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(),
-                                invalid_op_nodes_opencl.end());
-        break;
-      }
-      // else if // you can add more targets
-    }
-  };
+  auto insert_invalid_op_nodes_for_specific_target =
+      [&](std::set<std::string> op_node_set, TargetType specific_target) {
+        std::set<std::string> invalid_op_nodes_opencl = {"layout", "fc"};
+        for (auto& op_node : graph->StmtTopologicalOrder()) {
+          if (!op_node->IsStmt()) continue;
+          TargetType op_target_type = op_node->AsStmt().place().target;
+          if (op_target_type == specific_target &&
+              specific_target == TARGET(kOpenCL)) {
+            invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(),
+                                    invalid_op_nodes_opencl.end());
+            break;
+          }
+          // else if // you can add more targets
+        }
+      };
 
   VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size();
   insert_invalid_op_nodes_for_specific_target(invalid_op_nodes,
@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
                    TARGET(kRKNPU),
                    TARGET(kAPU),
                    TARGET(kMLU),
-                   TARGET(kHuaweiAscendNPU)});
+                   TARGET(kHuaweiAscendNPU),
+                   TARGET(kNNA)});
diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc
index 429c780912094baf9ceb8b5124dc197abd51af41..a02dea65023b432144d2efeca40cbd882555767f 100644
--- a/lite/core/mir/subgraph/subgraph_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -128,6 +128,20 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   fuser();
 }
 
+void NNASubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  // Collect the op types that have registered NNA bridges; only those ops
+  // are eligible for fusion into an NNA subgraph.
+  std::set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/nna/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
@@ -147,3 +161,5 @@ REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
     .BindTargets({TARGET(kRKNPU)});
 REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
     .BindTargets({TARGET(kMLU)});
+REGISTER_MIR_PASS(nna_subgraph_pass, paddle::lite::mir::NNASubgraphPass)
+    .BindTargets({TARGET(kNNA)});
diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h
index c40a527cfe72ab1556e868d05aab5c0280fa4514..9e9530ca3892b7f0b674b9a77fc1a4e4a24dbc11 100644
--- a/lite/core/mir/subgraph/subgraph_pass.h
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -57,6 +57,11 @@ class MLUSubgraphPass : public ProgramPass {
   void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 };
 
+class NNASubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 7709090c038cf81bee5a735b682ea0721ee30ec1..5be3e1e630df126f5bfeb3188300e2c1ce3ae229 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -126,6 +126,7 @@ class Optimizer {
          // of the quantized ops.
          "npu_subgraph_pass",
          "huawei_ascend_npu_subgraph_pass",
+         "nna_subgraph_pass",
          "xpu_subgraph_pass",
          "bm_subgraph_pass",
          "apu_subgraph_pass",
diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt
index 5dffd7c1a93225a38e433a4ff447b9b0fc863216..20e7f22db36304eeeae0f2bf8eee10737c0cea5b 100644
--- a/lite/gen_code/CMakeLists.txt
+++ b/lite/gen_code/CMakeLists.txt
@@ -17,6 +17,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
         NPU_DEPS ${npu_kernels}
         HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
         RKNPU_DEPS ${rknpu_kernels}
+        NNA_DEPS ${nna_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
@@ -47,6 +48,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
        NPU_DEPS ${npu_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
        RKNPU_DEPS ${rknpu_kernels}
+       NNA_DEPS ${nna_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index 91268bc28dbdf38137904f986b254a76cbd5e538..60d9a0fe1207d776213aff6a7ec51774bd7d600b 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -15,3 +15,4 @@ add_subdirectory(apu)
 add_subdirectory(bm)
 add_subdirectory(rknpu)
 add_subdirectory(huawei_ascend_npu)
+add_subdirectory(nna)
diff --git a/lite/kernels/nna/CMakeLists.txt b/lite/kernels/nna/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5af4f05d7c16e10f942a02940731a46d4f965479
--- /dev/null
+++ b/lite/kernels/nna/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(bridges)
+
+add_kernel(subgraph_compute_nna NNA basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_nna subgraph_bridge_engine ${nna_subgraph_bridges})
diff --git a/lite/kernels/nna/bridges/CMakeLists.txt b/lite/kernels/nna/bridges/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a8ad9a8a738af3c6c8c0ed621d4f4b44e5f7f025
--- /dev/null
+++ b/lite/kernels/nna/bridges/CMakeLists.txt
@@ -0,0 +1,81 @@
+if(NOT LITE_WITH_NNA)
+  return()
+endif()
+
+lite_cc_library(subgraph_bridge_utility_nna SRCS utility.cc DEPS ${nna_builder_libs} ${nna_runtime_libs} tensor)
+lite_cc_library(subgraph_bridge_graph_nna SRCS graph.cc DEPS subgraph_bridge_utility_nna)
+
+set(nna_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_nna subgraph_bridge_graph_nna)
+
+lite_cc_library(subgraph_bridge_fc_op_nna SRCS fc_op.cc DEPS ${nna_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_conv_op_nna SRCS conv_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_matmul_op_nna SRCS matmul_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_mul_op_nna SRCS mul_op.cc DEPS ${nna_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_act_op_nna SRCS act_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_scale_op_nna SRCS scale_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_softmax_op_nna SRCS softmax_op.cc DEPS ${nna_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_pool_op_nna SRCS pool_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_batch_norm_op_nna SRCS batch_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_elementwise_ops_nna SRCS elementwise_ops.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_reshape_op_nna SRCS reshape_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_conv_transpose_op_nna SRCS conv_transpose_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_interpolate_op_nna SRCS interpolate_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_transpose_op_nna SRCS transpose_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_split_op_nna SRCS split_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_concat_op_nna SRCS concat_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_shuffle_channel_op_nna SRCS shuffle_channel_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_pad2d_op_nna SRCS pad2d_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_reduce_mean_op_nna SRCS reduce_mean_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_unsqueeze_op_nna SRCS unsqueeze_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_gather_op_nna SRCS gather_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_lookup_table_op_nna SRCS lookup_table_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_argmax_op_nna SRCS argmax_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_instance_norm_op_nna SRCS instance_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_dropout_op_nna SRCS dropout_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_topk_op_nna SRCS topk_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_layer_norm_op_nna SRCS layer_norm_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_fill_constant_op_nna SRCS fill_constant_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_nna SRCS fill_constant_batch_size_like_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_increment_op_nna SRCS increment_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_compare_op_nna SRCS compare_op.cc DEPS ${nna_subgraph_bridge_deps})
+#lite_cc_library(subgraph_bridge_shape_op_nna SRCS shape_op.cc DEPS ${nna_subgraph_bridge_deps})
+
+
+set(nna_subgraph_bridges
+    subgraph_bridge_registry
+    subgraph_bridge_utility_nna
+    subgraph_bridge_graph_nna
+    subgraph_bridge_fc_op_nna
+    subgraph_bridge_conv_op_nna
+    #subgraph_bridge_matmul_op_nna
+    #subgraph_bridge_mul_op_nna
+    subgraph_bridge_act_op_nna
+    #subgraph_bridge_scale_op_nna
+    #subgraph_bridge_softmax_op_nna
+    subgraph_bridge_pool_op_nna
+    #subgraph_bridge_batch_norm_op_nna
+    #subgraph_bridge_elementwise_ops_nna
+    #subgraph_bridge_reshape_op_nna
+    #subgraph_bridge_conv_transpose_op_nna
+    #subgraph_bridge_interpolate_op_nna
+    #subgraph_bridge_transpose_op_nna
+    #subgraph_bridge_split_op_nna
+    #subgraph_bridge_concat_op_nna
+    #subgraph_bridge_shuffle_channel_op_nna
+    #subgraph_bridge_pad2d_op_nna
+    #subgraph_bridge_reduce_mean_op_nna
+    #subgraph_bridge_unsqueeze_op_nna
+    #subgraph_bridge_gather_op_nna
+    #subgraph_bridge_lookup_table_op_nna
+    #subgraph_bridge_argmax_op_nna
+    #subgraph_bridge_instance_norm_op_nna
+    #subgraph_bridge_dropout_op_nna
+    #subgraph_bridge_topk_op_nna
+    #subgraph_bridge_layer_norm_op_nna
+    #subgraph_bridge_fill_constant_op_nna
+    #subgraph_bridge_fill_constant_batch_size_like_op_nna
+    #subgraph_bridge_increment_op_nna
+    #subgraph_bridge_compare_op_nna
+    CACHE INTERNAL "nna_subgraph_bridges")
+
+message(STATUS "
nna_subgraph_bridges: ${nna_subgraph_bridges}")
diff --git a/lite/kernels/nna/bridges/act_op.cc b/lite/kernels/nna/bridges/act_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8fc30fab483cfa9b9f5176bc07a4f7a6280ae9f
--- /dev/null
+++ b/lite/kernels/nna/bridges/act_op.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/nna/bridges/graph.h"
+#include "lite/kernels/nna/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace nna {
+
+// TODO(nna): templatize on the activation type; for now only relu is wired up.
+int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NNA] Converting " + op_type + "...";
+
+  // Get input and output vars and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x_type = kernel->GetInputDeclType("X");
+  CHECK(x_type->precision() == PRECISION(kFloat));
+  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
+  auto x = scope->FindMutableTensor(x_name);
+  auto x_dims = x->dims();
+  auto out_name = op_info->Output("Out").front();
+  auto out_type = kernel->GetOutputDeclType("Out");
+  CHECK(out_type->precision() == PRECISION(kFloat));
+  CHECK(out_type->layout() == DATALAYOUT(kNCHW));
+
+  // X node
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    // x_node = graph->Add(x_name, *x);
+    LOG(WARNING) << "[NNA] ActConverter: x_node " << x_name << " not in graph";
+    return FAILED;
+  }
+
+  imgdnn_tensor relu_output = graph->GetBuilder()->createReLULayer(
+      x_node->data(), true, 0.0, false, 0.0, 0.0);
+
+  imgdnn_tensor_descriptor desc;
+  imgdnn_err_code err = imgdnnGetTensorDescriptor(relu_output, &desc);
+  CHECK(err == IMGDNN_SUCCESS) << "failed to get tensor descriptor (RELU)";
+
+  graph->Add(out_name, relu_output, desc.type);
+
+  return SUCCESS;
+}
+
+}  // namespace nna
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+#if 0
+REGISTER_SUBGRAPH_BRIDGE(
+    sigmoid,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+#endif
+REGISTER_SUBGRAPH_BRIDGE(relu, kNNA, paddle::lite::subgraph::nna::ActConverter);
+#if 0
+REGISTER_SUBGRAPH_BRIDGE(
+    tanh, kNNA, paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    relu_clipped,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    relu6, kNNA, paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    leaky_relu,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    abs, kNNA, paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    softsign,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    softplus,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    hard_sigmoid,
+    kNNA,
+    paddle::lite::subgraph::nna::ActConverter);
+
+REGISTER_SUBGRAPH_BRIDGE(
+    log, kNNA, paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    square, kNNA, paddle::lite::subgraph::nna::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(
+    sqrt, kNNA, paddle::lite::subgraph::nna::ActConverter);
+#endif
diff --git a/lite/kernels/nna/bridges/batch_norm_op.cc b/lite/kernels/nna/bridges/batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f870a1a3a93b2ae1f5eb60d688b68fbc3b717dd0
--- /dev/null
+++ b/lite/kernels/nna/bridges/batch_norm_op.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/nna/bridges/graph.h"
+#include "lite/kernels/nna/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace nna {
+
+int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NNA] Converting " + op_type + "...";
+
+  // Get input and output vars and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x = scope->FindMutableTensor(x_name);
+  auto x_dims = x->dims();
+  auto scale_name = op_info->Input("Scale").front();
+  auto scale = scope->FindMutableTensor(scale_name);
+  auto bias_name = op_info->Input("Bias").front();
+  auto bias = scope->FindMutableTensor(bias_name);
+  auto mean_name = op_info->Input("Mean").front();
+  auto mean = scope->FindMutableTensor(mean_name);
+  auto variance_name = op_info->Input("Variance").front();
+  auto variance = scope->FindMutableTensor(variance_name);
+  auto y_name = op_info->Output("Y").front();
+  // float momentum = op_info->GetAttr<float>("momentum");
+  float epsilon = op_info->GetAttr<float>("epsilon");
+  // int mode = 1;  // bnScale, bnBias tensor dims are 1xCx1x1
+  /*
+  bool use_global_stats = !op_info->HasAttr("use_global_stats") ||
+                          op_info->GetAttr<bool>("use_global_stats");
+  if (!use_global_stats) {
+    LOG(WARNING) << "[NNA] Only use_global_stats=true is supported by DDK";
+  }
+  */
+
+  // X node
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    // x_node = graph->Add(x_name, *x);
+    LOG(WARNING) << "[NNA] BatchNormConverter: x_node " << x_name
+                 << " not in graph";
+    return FAILED;
+  }
+
+  auto& builder = *graph->GetBuilder();
+  auto bn_out = builder.createBatchNormLayer(x_node->data(),
+                                             mean->mutable_data<float>(),
+                                             variance->mutable_data<float>(),
+                                             epsilon);
+  bn_out = builder.createScaleLayer(
+      bn_out, true, scale->mutable_data<float>(), bias->mutable_data<float>());
+
+  // PrecisionType precision = x->precision();
+  imgdnn_tensor_descriptor desc;
+  imgdnn_err_code err = imgdnnGetTensorDescriptor(bn_out, &desc);
+  CHECK(err == IMGDNN_SUCCESS) << "failed to get tensor descriptor (BN)";
+
+  graph->Add(y_name, bn_out, desc.type);
+
+  return SUCCESS;
+}
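+// Note on the math (assuming imgdnn's batch-norm and scale layers follow the
+// usual definitions, which is how this converter uses them):
+//   normalized = (x - mean) / sqrt(variance + epsilon)
+//   y          = scale * normalized + bias
+// createBatchNormLayer performs the normalization and createScaleLayer
+// applies the per-channel affine part.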
+
+}  // namespace nna
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(batch_norm,
+                         kNNA,
+                         paddle::lite::subgraph::nna::BatchNormConverter);
diff --git a/lite/kernels/nna/bridges/conv_op.cc b/lite/kernels/nna/bridges/conv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..967aef8b6ea6d3aa6c4badfe71f57ba2897d0e29
--- /dev/null
+++ b/lite/kernels/nna/bridges/conv_op.cc
@@ -0,0 +1,296 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/conv_op.h"
+#include "lite/kernels/nna/bridges/graph.h"
+#include "lite/kernels/nna/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace nna {
+
+int ConvConverter(void *ctx, OpLite *op, KernelBase *kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph *>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NNA] Converting " << op_type << "... ";
+
+  // Get input and output vars and op attributes
+  auto input_name = op_info->Input("Input").front();
+  auto input = scope->FindMutableTensor(input_name);
+  auto input_dims = input->dims();
+
+  auto filter_name = op_info->Input("Filter").front();
+  auto filter = scope->FindMutableTensor(filter_name);
+  auto filter_dims = filter->dims();
+
+  auto output_name = op_info->Output("Output").front();
+  auto output = scope->FindMutableTensor(output_name);
+  auto output_dims = output->dims();
+
+  auto bs = input_dims[0];
+  auto ic = input_dims[1];
+  auto oc = filter_dims[0];
+  CHECK_EQ(input_dims.size(), 4L);
+  CHECK_EQ(output_dims.size(), 4L);
+  CHECK_EQ(filter_dims.size(), 4L);
+  CHECK_EQ(output_dims[0], bs);
+  CHECK_EQ(output_dims[1], oc);
+  auto strides = op_info->GetAttr<std::vector<int>>("strides");
+  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
+  auto groups = op_info->GetAttr<int>("groups");
+  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
+  bool with_act =
+      op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
+  std::string act_type =
+      with_act ? op_info->GetAttr<std::string>("act_type") : "";
+  float leaky_relu_alpha = act_type == "leaky_relu"
+                               ? op_info->GetAttr<float>("leaky_relu_alpha")
+                               : 0.f;
+
+  CHECK_EQ(strides.size(), 2L);
+  CHECK_EQ(dilations.size(), 2L);
+
+  // for quantization
+  bool enable_int8 = false;
+  float input_scale = 1.0;
+  float output_scale = 1.0;
+  std::vector<float> weight_scale;
+  TensorInfo qnt;
+
+  if (op_info->HasAttr("enable_int8")) {
+    enable_int8 = op_info->GetAttr<bool>("enable_int8");
+    input_scale = op_info->GetAttr<float>("input_scale");
+    output_scale = op_info->GetAttr<float>("output_scale");
+    weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
+  }
+
+  // Input node
+  std::shared_ptr<Node> input_node = nullptr;
+  imgdnn_tensor in_tensor;
+  if (graph->Has(input_name)) {
+    input_node = graph->Get(input_name);
+    in_tensor = input_node->data();
+  } else {
+    TensorInfoReset(&qnt);
+    if (enable_int8)
+      qnt.type = IMGDNN_TYPE_Q_U8;
+    else
+      qnt.type = IMGDNN_TYPE_F32;
+    qnt.scales.push_back(input_scale);
+    qnt.zero_points.push_back(128);
+    input_node = graph->Add(input_name, *input, qnt, Node::Role::kInput);
+    in_tensor = input_node->data();
+  }
+
+  if (paddings.size() == 2L) {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      int copy_pad = *(paddings.begin() + 2 * i);
+      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
+    }
+  }
+  CHECK_EQ(paddings.size(), 4L)
+      << "[NNA] Paddings size should be the same or twice as the input size.";
+
+  std::string padding_algorithm("");
+  if (op_info->HasAttr("padding_algorithm")) {
+    padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
+  }
+  operators::UpdatePaddingAndDilation(&paddings,
+                                      &dilations,
+                                      strides,
+                                      padding_algorithm,
+                                      input_dims,
+                                      filter_dims);
+
+  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
+  bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
+
+  // Filter node
+  std::shared_ptr<Node> filter_node = nullptr;
+  imgdnn_tensor filter_tensor;
+  bool per_channel = isScalesPerChannel(weight_scale);
+  TensorInfoReset(&qnt);
+  uint8_t *weights_u8 =
+      graph->GetBuilder()->GetBufromPool(filter_dims.production());
+  if (enable_int8) {
+    const char *weight_src = static_cast<const char *>(filter->raw_data());
+
+    qnt.type = IMGDNN_TYPE_Q_U8;
+    if (per_channel) {
+      qnt.scales.assign(weight_scale.begin(), weight_scale.end());
+      qnt.zero_points.assign(weight_scale.size(), 128);
+      qnt.count = oc;
+      qnt.axis = 1;
+    } else {
+      qnt.scales.push_back(weight_scale.at(0));
+      qnt.zero_points.push_back(128);
+    }
+    // Shift signed int8 weights into the unsigned imgdnn domain.
+    for (int i = 0; i < filter_dims.production(); i++) {
+      weights_u8[i] = static_cast<uint8_t>(weight_src[i] + 128);
+    }
+
+    filter_node = graph->Add(filter_name,
+                             weights_u8,
+                             filter_dims.Vectorize(),
+                             qnt,
+                             Node::Role::kConst);
+  } else {
+    qnt.type = IMGDNN_TYPE_F32;
+    filter_node = graph->Add(filter_name, *filter, qnt, Node::Role::kConst);
+  }
+  filter_tensor = filter_node->data();
+
+  // Add bias node if exists bias
+  // Supports the bias nodes with the following dimensions
+  // 0: {oc}
+  // 1: {1, oc, oh, ow}
+  // 2: {n, oc, oh, ow}
+  std::shared_ptr<Node> bias_node = nullptr;
+  imgdnn_tensor bias_tensor = nullptr;
+  if (HasInputArg(op_info, scope, "Bias")) {
+    auto bias_name = op_info->Input("Bias").front();
+    if (graph->Has(bias_name)) {
+      bias_node = graph->Get(bias_name);
+    } else {
+      auto bias = scope->FindMutableTensor(bias_name);
+      auto bias_dims = bias->dims();
+      auto bias_data_size = bias_dims.production();
+      auto output_data_size = output_dims.production();
+      std::vector<int64_t> bias_shape;
+      if (bias_data_size == oc) {
+        // 0: {oc}
+        bias_shape = {1, oc, 1, 1};
+      } else if (bias_data_size == output_data_size / bs) {
+        // 1: {1, oc, oh, ow}
+        bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
+      } else if (bias_data_size == output_data_size) {
+        // 2: {n, oc, oh, ow}
+        bias_shape = output_dims.Vectorize();
+      } else {
+        LOG(WARNING)
+            << "[NNA] Bias dimension " << bias_dims
+            << " isn't supported in conv2d Op when output dimension is "
+            << output_dims;
+        return FAILED;
+      }
+
+      TensorInfoReset(&qnt);
+      std::vector<int64_t> shapes{1, oc};
+      auto bias_data = bias->data<float>();
+      if (enable_int8) {
+        qnt.type = IMGDNN_TYPE_I32;
+        if (per_channel) {
+          qnt.scales.resize(bias_data_size);
+          for (int i = 0; i < bias_data_size; i++)
+            qnt.scales[i] = input_scale * weight_scale[i];
+          qnt.zero_points.assign(bias_data_size, 0);
+          qnt.count = 2;
+          qnt.axis = 1;
+        } else {
+          qnt.scales.push_back(input_scale * weight_scale[0]);
+          qnt.zero_points.push_back(0);
+        }
+
+        int quant_bits = 32;
+        auto dtype_max = static_cast<int32_t>((1 << (quant_bits - 1)) - 1);
+        auto dtype_min = static_cast<int32_t>(0 - dtype_max);
+
+        int32_t *bias_qnt_data =
+            reinterpret_cast<int32_t *>(graph->GetBuilder()->GetBufromPool(
+                bias_dims.production() * sizeof(int32_t)));
+        for (int i = 0; i < bias_data_size; i++) {
+          float current_scale = per_channel ? qnt.scales[i] : qnt.scales[0];
+          bias_qnt_data[i] = std::min(
+              std::max(static_cast<int32_t>(bias_data[i] / current_scale),
+                       dtype_min),
+              dtype_max);
+        }
+
+        bias_node = graph->Add(
+            bias_name, bias_qnt_data, shapes, qnt, Node::Role::kConst);
+      } else {
+        qnt.type = IMGDNN_TYPE_F32;
+        std::vector<float> bias_float_data(bias_data,
+                                           bias_data + bias_data_size);
+        bias_node = graph->Add(
+            bias_name, bias_float_data.data(), shapes, qnt, Node::Role::kConst);
+      }
+    }
+    // Covers both the cached and the newly created bias node.
+    bias_tensor = bias_node->data();
+  }
+
+  unsigned int img_stride[2] = {(unsigned int)strides[0],
+                                (unsigned int)strides[1]};
+  unsigned int pad_to_begin[2] = {(unsigned int)paddings[0],
+                                  (unsigned int)paddings[2]};  // top,left
+  unsigned int pad_to_end[2] = {(unsigned int)paddings[1],
+                                (unsigned int)paddings[3]};  // bottom,right
+  unsigned int img_dilation[2] = {(unsigned int)dilations[0],
+                                  (unsigned int)dilations[1]};
+
+  imgdnn_quant_param output_quant_param;
+  output_quant_param.scale = output_scale;
+  output_quant_param.zero_point = 128;
+
+  imgdnn_tensor conv_out =
+      graph->GetBuilder()->createConvolutionLayer(in_tensor,
+                                                  filter_tensor,
+                                                  bias_tensor,
+                                                  output_quant_param,
+                                                  img_stride,
+                                                  pad_to_begin,
+                                                  pad_to_end,
+                                                  img_dilation,
+                                                  is_depthwise_mode);
+
+  if (!act_type.empty()) {
+    // Fall back to the raw conv output if act_type is not handled below.
+    imgdnn_tensor act_out = conv_out;
+    if (act_type == "leaky_relu") {
+      act_out = graph->GetBuilder()->createReLULayer(
+          conv_out, false, 0.0, false, 0.0, leaky_relu_alpha);
+    } else if (act_type == "relu6") {
+      act_out = graph->GetBuilder()->createReLULayer(
+          conv_out, true, 0.0, true, 6.0, 0.0);
+    } else if (act_type == "relu") {
+      act_out = graph->GetBuilder()->createReLULayer(
+          conv_out, true, 0.0, false, 0.0, 0.0);
+    } else {
+      VLOG(3) << "act_type: " << act_type << " not handled";
+    }
+    graph->Add(output_name, act_out, IMGDNN_TYPE_Q_U8);
+  } else {
+    graph->Add(output_name, conv_out, IMGDNN_TYPE_Q_U8);
+  }
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace nna
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(conv2d,
+                         kNNA,
+                         paddle::lite::subgraph::nna::ConvConverter);
+
+REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
+                         kNNA,
+                         paddle::lite::subgraph::nna::ConvConverter);
diff --git a/lite/kernels/nna/bridges/fc_op.cc b/lite/kernels/nna/bridges/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d7822c0d745e3c55cc932de93e415e539b58bd5
--- /dev/null
+++ 
b/lite/kernels/nna/bridges/fc_op.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "imgdnn.h" // NOLINT +#include "lite/kernels/nna/bridges/graph.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace nna { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[NNA] Converting " + op_type + "..."; + + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindTensor(input_name); + auto input_dims = input->dims(); + + auto weight_name = op_info->Input("W").front(); + auto weights = scope->FindTensor(weight_name); + auto w_dims = weights->dims(); + CHECK_EQ(w_dims.size(), 2UL); + + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindTensor(out_name); + auto out_dims = out->dims(); + + // notes : m, input row + // k, input col + // n, weight col + // input_dims : {1,1024,1,1} + // in_num_col_dims : 1 + // m =1, k=1024,n=1000 + // w_dims : {1024,1000} + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "[NNA] input dims: " << input_dims << " w dims: " << w_dims + << " m: " << m << " k: " << k << " n: " << n; + + // for quantization + bool enable_int8 = false; + float input_scale = 1.0; + float output_scale = 1.0; + std::vector weight_scale; + TensorInfo qnt; + + if (op_info->HasAttr("enable_int8")) { + enable_int8 = op_info->GetAttr("enable_int8"); + input_scale = op_info->GetAttr("input_scale"); + output_scale = op_info->GetAttr("output_scale"); + weight_scale = op_info->GetAttr>("weight_scale"); + } + + // Create input node and reshape it to (m, k, 1, 1) + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + LOG(FATAL) << "[NNA] input node: " << input_name << ", could not be found"; + } + + // weight tensor + std::shared_ptr weight_node = nullptr; + bool per_channel = isScalesPerChannel(weight_scale); + uint8_t* weights_u8 = graph->GetBuilder()->GetBufromPool(w_dims.production()); + if (enable_int8) { + qnt.type = IMGDNN_TYPE_Q_U8; + if (per_channel) { + LOG(FATAL) + << "[NNA] FC per-channel quantization is not supported for Mirage"; + } else { + qnt.scales.push_back(weight_scale.at(0)); + qnt.zero_points.push_back(128); + } + const char* weight_src = static_cast(weights->raw_data()); + for (int i = 0; i < w_dims.production(); i++) + weights_u8[i] = static_cast(weight_src[i] + 128); + } else { + LOG(FATAL) << "[NNA] PaddleLite Only 8-bits quantization."; + } + weight_node = graph->Add( + weight_name, 
weights_u8, w_dims.Vectorize(), qnt, Node::Role::kConst); + + // Add bias node if bias tensor exists + imgdnn_tensor bias_tensor = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + std::shared_ptr bias_node = nullptr; + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.production(), n); + + if (enable_int8 && bias->precision() == PRECISION(kFloat)) { + TensorInfoReset(&qnt); + qnt.type = IMGDNN_TYPE_I32; + if (per_channel) { + qnt.scales.resize(weight_scale.size()); + qnt.count = bias_dims.size(); + qnt.axis = 0; + for (int i = 0; i < weight_scale.size(); i++) { + qnt.scales[i] = input_scale * weight_scale[i]; + } + LOG(FATAL) + << "[NNA] per-channel quantization is not supported for FC"; + } else { + qnt.scales.push_back(weight_scale.at(0) * input_scale); + qnt.zero_points.push_back(0); + } + + int quant_bits = 32; + auto dtype_max = static_cast((1 << (quant_bits - 1)) - 1); + auto dtype_min = static_cast(0 - dtype_max); + + auto* bias_data = bias->data(); + int32_t* bias_qnt_data = + reinterpret_cast(graph->GetBuilder()->GetBufromPool( + bias_dims.production() * sizeof(int32_t))); + for (int i = 0; i < n; i++) { + float current_scale = per_channel ? qnt.scales[i] : qnt.scales[0]; + bias_qnt_data[i] = + std::min(std::max(static_cast(bias_data[i] / current_scale), + dtype_min), + dtype_max); + } + + std::vector shapes{1}; + bias_node = graph->Add( + bias_name, bias_qnt_data, shapes, qnt, Node::Role::kConst); + } else { + qnt.type = IMGDNN_TYPE_F32; + bias_node = graph->Add(bias_name, *bias, qnt, Node::Role::kConst); + } + } + bias_tensor = bias_node->data(); + } + + imgdnn_quant_param output_quant_param; + output_quant_param.scale = output_scale; + output_quant_param.zero_point = 128; + imgdnn_tensor fc_out_tensor = graph->GetBuilder()->createFullyConnectedLayer( + input_node->data(), weight_node->data(), bias_tensor, output_quant_param); + + imgdnn_tensor_descriptor desc; + imgdnn_err_code err = imgdnnGetTensorDescriptor(fc_out_tensor, &desc); + graph->Add(out_name, fc_out_tensor, desc.type); + CHECK(err == IMGDNN_SUCCESS) << "fail get tensor description(FC)"; + + // reshape to out_dims + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace nna +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(fc, kNNA, paddle::lite::subgraph::nna::FCConverter); diff --git a/lite/kernels/nna/bridges/graph.cc b/lite/kernels/nna/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..c867dfa6967e55c160080b6c2163017108b592b5 --- /dev/null +++ b/lite/kernels/nna/bridges/graph.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
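+//
+// Summary of the Add() overloads implemented below: Add(name, node) registers
+// a node under a Paddle variable name (redefinition is only allowed for
+// intermediate kData nodes); Add(name, const_data, shape, qnt, role) wraps a
+// raw host buffer (e.g. requantized weights/bias) as a fixed imgdnn input
+// tensor; Add(name, tensor, shape, qnt, role) wraps a lite::Tensor either as
+// a network input (kInput) or as a fixed constant (kConst).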
+ +#include "lite/kernels/nna/bridges/graph.h" +#include +#include "lite/kernels/nna/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace nna { + +// Add 1 +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only intermediate node can be shared with the same name + if (!node->is_data() || !it->second.back()->is_data()) { + LOG(FATAL) << "[NNA] Const or Input node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Add 2 +std::shared_ptr Graph::Add(const std::string& name, + const void* const const_data, + std::vector shape, + const TensorInfo& qnt, + Node::Role role /* = Node::Role::kData*/) { + auto node = std::make_shared(qnt.type, qnt.layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + + imgdnn_tensor_descriptor desc; + desc.type = qnt.type; + desc.dimensions = (unsigned)shape.size(); + for (uint32_t i = 0; i < shape.size(); ++i) desc.size[i] = shape[i]; + + switch (qnt.type) { + case IMGDNN_TYPE_F32: + case IMGDNN_TYPE_I32: + break; + case IMGDNN_TYPE_Q_I8: + case IMGDNN_TYPE_Q_U8: + desc.quant_param.scale = qnt.scales[0]; + desc.quant_param.zero_point = qnt.zero_points[0]; + break; + case IMGDNN_TYPE_QPA_I8: + case IMGDNN_TYPE_QPA_U8: + desc.quant_param.per_axis = imgdnnCreatePerAxisQuantParam( + qnt.axis, qnt.count, qnt.scales.data(), qnt.zero_points.data()); + CHECK(desc.quant_param.per_axis != nullptr); + break; + default: + LOG(FATAL) << "[NNA] invalid tensor type set in node: " << name; + return nullptr; + } + + imgdnn_tensor out_tensor; + if (role == Node::Role::kConst) { + out_tensor = pImgdnnMgr->createFixedInputTensor(&desc, const_data, true); + } else { + LOG(INFO) << "[NNA] invald role set in this path: " << name; + } + + if ((desc.type == IMGDNN_TYPE_QPA_I8 || desc.type == IMGDNN_TYPE_QPA_U8) && + desc.quant_param.per_axis != nullptr) + imgdnnDestroyPerAxisQuantParam(desc.quant_param.per_axis); + + node->set_data(out_tensor); + + return node; +} + +// Add 3 +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + const TensorInfo& qnt, + Node::Role role) { + auto node = std::make_shared(qnt.type, qnt.layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + + imgdnn_tensor_descriptor desc; + desc.type = qnt.type; + desc.dimensions = (unsigned)shape.size(); + for (uint32_t i = 0; i < shape.size(); ++i) desc.size[i] = shape[i]; + + switch (qnt.type) { + case IMGDNN_TYPE_F32: + case IMGDNN_TYPE_I32: + break; + case IMGDNN_TYPE_Q_I8: + case IMGDNN_TYPE_Q_U8: + desc.quant_param.scale = qnt.scales[0]; + desc.quant_param.zero_point = qnt.zero_points[0]; + break; + case IMGDNN_TYPE_QPA_I8: + case IMGDNN_TYPE_QPA_U8: + desc.quant_param.per_axis = imgdnnCreatePerAxisQuantParam( + qnt.axis, qnt.count, qnt.scales.data(), qnt.zero_points.data()); + CHECK(desc.quant_param.per_axis != nullptr); + break; + default: + LOG(FATAL) << "[NNA] invalid tensor type set in node: " << name; + return nullptr; + } + + imgdnn_tensor out_tensor; + if (role == Node::Role::kInput) { + out_tensor = pImgdnnMgr->createInputTensor(&desc); + } else if (role == Node::Role::kConst) { + const void* const_data = tensor.raw_data(); + out_tensor = pImgdnnMgr->createFixedInputTensor(&desc, const_data, false); + } else { + LOG(INFO) << "[NNA] invald role 
set in this path: " << name; + } + + if ((desc.type == IMGDNN_TYPE_QPA_I8 || desc.type == IMGDNN_TYPE_QPA_U8) && + desc.quant_param.per_axis != nullptr) + imgdnnDestroyPerAxisQuantParam(desc.quant_param.per_axis); + + node->set_data(out_tensor); + + return node; +} + +} // namespace nna +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/nna/bridges/graph.h b/lite/kernels/nna/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..660c04211777bb0bd5ea9de7e7ef6c0fb2091996 --- /dev/null +++ b/lite/kernels/nna/bridges/graph.h @@ -0,0 +1,145 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "imgdnn.h" // NOLINT +#include "lite/backends/nna/imgdnn_manager.h" +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "utility.h" // NOLINT + +namespace paddle { +namespace lite { +namespace subgraph { +namespace nna { + +#define NNA_UNUSED(var) \ + do { \ + (void)(var); \ + } while (0) + +// Graph and node is defined to collect all of converted IMGDNN IR nodes +class Node { + public: + enum class Role { + kInput = 0, + kConst, + kData, + }; + + Node(imgdnn_tensor data, imgdnn_type type, DataLayoutType layout, Role role) + : data_(data), type_(type), layout_(layout), role_(role) {} + + Node(imgdnn_type type, DataLayoutType layout, Role role) + : type_(type), layout_(layout), role_(role) {} + + void set_data(imgdnn_tensor data) { data_ = data; } + void set_type(imgdnn_type type) { type_ = type; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_role(Role role) { role_ = role; } + + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + imgdnn_tensor data() { return data_; } + imgdnn_type type() const { return type_; } + DataLayoutType layout() const { return layout_; } + + bool is_input() const { return role_ == Role::kInput; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } + + private: + imgdnn_tensor data_{nullptr}; + imgdnn_type type_{IMGDNN_TYPE_MAX}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + Role role_{Role::kData}; +}; + +class Graph { + public: + explicit Graph(lite::nna::ImgdnnManager* pMgr) { + pImgdnnMgr = pMgr; + std::cout << "graph construct" << std::endl; + } + + ~Graph() { std::cout << "Graph deconst" << std::endl; } + + // Add 1 + int Add(const std::string& name, std::shared_ptr node); + + // Add 2, weights,bias + std::shared_ptr Add(const std::string& name, + const void* const const_data, + std::vector shape, + const TensorInfo& qnt, + Node::Role role /* = Node::Role::kData*/); + + // Add 3 + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + const TensorInfo& qnt, + Node::Role role); + // Add 4 + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + const 
TensorInfo& qnt, + Node::Role role) { + return Add(name, tensor, tensor.dims().Vectorize(), qnt, role); + } + + // Used to add intermediate tensor + // Add 5 + int Add(const std::string& name, + imgdnn_tensor tensor, + imgdnn_type type, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kData; + auto node = std::make_shared(type, layout, role); + node->set_data(tensor); + return Add(name, node); // call Add 1 + } + + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[NNA] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + lite::nna::ImgdnnManager* GetBuilder() { + ASSERT(pImgdnnMgr == nullptr, "pImgdnnMgr used before initialize"); + return pImgdnnMgr; + } + + private: + std::unordered_map>> nodes_; + lite::nna::ImgdnnManager* pImgdnnMgr{nullptr}; +}; +} // namespace nna + +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/nna/bridges/paddle_use_bridges.h b/lite/kernels/nna/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..38803ec15b9dc86380e636a10f30b41d371f17d6 --- /dev/null +++ b/lite/kernels/nna/bridges/paddle_use_bridges.h @@ -0,0 +1,22 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kNNA); +USE_SUBGRAPH_BRIDGE(conv2d, kNNA); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNNA); +USE_SUBGRAPH_BRIDGE(fc, kNNA); +USE_SUBGRAPH_BRIDGE(pool2d, kNNA); +// USE_SUBGRAPH_BRIDGE(softmax, kNNA); diff --git a/lite/kernels/nna/bridges/pool_op.cc b/lite/kernels/nna/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6142af9bb27c053d79fe6c34777522ff7c2aa4c8 --- /dev/null +++ b/lite/kernels/nna/bridges/pool_op.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
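+
+// Conversion notes (from the code below): pool2d maps to a single imgdnn
+// pooling layer. Only "max" and "avg" pooling are supported; paddings are
+// normalized to four values (top, bottom, left, right); global_pooling is
+// implemented by widening the kernel to the full input H x W; the output is
+// quantized U8 with zero_point 128.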
+ +#include "lite/operators/pool_op.h" +#include "imgdnn.h" // NOLINT +#include "lite/kernels/nna/bridges/graph.h" +#include "lite/kernels/nna/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace nna { + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[NNA] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // for quantization + float output_scale = 1.0; + + if (op_info->HasAttr("enable_int8")) { + output_scale = op_info->GetAttr("output_scale"); + } + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + // x_node = graph->Add(x_name, *x); + LOG(INFO) << "[NNA] Pooling input not found: " << x_name; + } + + // pool mode + imgdnn_pooling_type img_pool_type; + if (pooling_type == "max") { + img_pool_type = IMGDNN_POOLING_MAX; + } else if (pooling_type == "avg") { + img_pool_type = IMGDNN_POOLING_AVERAGE; + } else { + LOG(WARNING) << "[NNA] Unsupported pooling type: " << pooling_type; + return FAILED; + } + + // pad mode + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + // paddings and strides + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[NNA] Paddings size should be the same or twice as the inputs size."; + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + auto strides = op_info->GetAttr>("strides"); + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + // ceil mode + /* bool ceil_mode = + op_info->HasAttr("ceil_mode") && op_info->GetAttr("ceil_mode"); + */ + + unsigned int img_ksize[2] = {(unsigned int)ksize[0], (unsigned int)ksize[1]}; + unsigned int img_stride[2] = {(unsigned int)strides[0], + (unsigned int)strides[1]}; + unsigned int pad_to_begin[2] = {(unsigned int)paddings[0], + (unsigned int)paddings[2]}; // top,left + unsigned int pad_to_end[2] = {(unsigned int)paddings[1], + (unsigned int)paddings[3]}; // bottom,right + + if (global_pooling) { + img_ksize[0] = x_dims[2]; + img_ksize[1] = x_dims[3]; + } + + imgdnn_quant_param output_quant_param; + output_quant_param.scale = output_scale; + output_quant_param.zero_point = 128; + imgdnn_tensor pooling_out = + graph->GetBuilder()->createPoolingLayer(x_node->data(), + output_quant_param, + img_ksize, + img_stride, + pad_to_begin, + pad_to_end, + img_pool_type); + + // LOG(INFO) << "pooling op output:" << static_cast(pooling_out); + + imgdnn_tensor_descriptor desc; + imgdnn_err_code err = imgdnnGetTensorDescriptor(pooling_out, &desc); + CHECK(err == IMGDNN_SUCCESS) << "fail get 
tensor description(POOL)"; + + graph->Add(out_name, pooling_out, desc.type); + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace nna +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kNNA, + paddle::lite::subgraph::nna::PoolConverter); diff --git a/lite/kernels/nna/bridges/softmax_op.cc b/lite/kernels/nna/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9842cb05d3c39d36c9e0905baaa54c03a4b02da8 --- /dev/null +++ b/lite/kernels/nna/bridges/softmax_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/nna/bridges/graph.h" +#include "lite/kernels/nna/bridges/registry.h" +#include "lite/kernels/npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace nna { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[NNA] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto x_rank = x_dims.size(); + auto out_name = op_info->Output("Out").front(); + int axis = op_info->HasAttr("axis") ? op_info->GetAttr("axis") : -1; + if (axis < 0) { + axis += x_rank; + } + + // for quantization + float output_scale = 1.0; + + if (op_info->HasAttr("enable_int8")) { + output_scale = op_info->GetAttr("output_scale"); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + LOG(FATAL) << "[NNA] Softmax: Could not find the input tensor."; + } + + imgdnn_quant_param output_quant_param; + output_quant_param.scale = output_scale; + output_quant_param.zero_point = 128; + imgdnn_tensor softmax_out_tensor = graph->GetBuilder()->createSoftmaxLayer( + x_node->data(), 1.0, axis, output_quant_param); + + graph->Add(out_name, softmax_out_tensor, IMGDNN_TYPE_Q_U8); + } else { + LOG(FATAL) << "[NNA] Softmax: has no enable_int8 attribute."; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace nna +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kNNA, + paddle::lite::subgraph::nna::SoftmaxConverter); diff --git a/lite/kernels/nna/bridges/utility.cc b/lite/kernels/nna/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9c33655da375b714076b4ea5fa56b00cf3563fa --- /dev/null +++ b/lite/kernels/nna/bridges/utility.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/nna/bridges/utility.h"
+#include <algorithm>
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace nna {
+
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname) {
+  auto iarg_names = op_info->input_argnames();
+  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
+      iarg_names.end()) {
+    auto inputs = op_info->Input(argname);
+    if (inputs.empty()) {
+      return false;
+    }
+    auto var_name = inputs.front();
+    auto var = scope->FindVar(var_name);
+    return var != nullptr;
+  } else {
+    return false;
+  }
+}
+
+bool isScalesPerChannel(std::vector<float> scales) {
+  bool per_channel = false;
+  for (std::vector<float>::iterator iter = scales.begin() + 1;
+       iter != scales.end();
+       iter++) {
+    if (*iter != scales.at(0)) {
+      per_channel = true;
+      break;
+    }
+  }
+  return per_channel;
+}
+
+void TensorInfoReset(TensorInfo* qnt) {
+  qnt->count = 0;
+  qnt->axis = 0;
+  qnt->scales.clear();
+  // qnt->scales.shrink_to_fit();
+  qnt->zero_points.clear();
+  // qnt->zero_points.shrink_to_fit();
+  qnt->layout = DATALAYOUT(kNCHW);
+}
+
+}  // namespace nna
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/nna/bridges/utility.h b/lite/kernels/nna/bridges/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..940db366da90bb354dd3dc511ced2288e93124df
--- /dev/null
+++ b/lite/kernels/nna/bridges/utility.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "imgdnn.h"  // NOLINT
+#include "lite/core/op_lite.h"
+#include "lite/utils/macros.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace nna {
+
+struct TensorInfo {
+  imgdnn_type type;
+  std::vector<float> scales;
+  std::vector<int> zero_points;
+  DataLayoutType layout;
+  unsigned count;
+  unsigned axis;
+};
+
+bool HasInputArg(const OpInfo* op_info,
+                 const Scope* scope,
+                 const std::string& argname);
+bool isScalesPerChannel(std::vector<float> scales);
+
+void TensorInfoReset(TensorInfo* qnt);
+
+}  // namespace nna
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/nna/subgraph_compute.cc b/lite/kernels/nna/subgraph_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4c8f18804675772f48029d8b9173430a2202fdb
--- /dev/null
+++ b/lite/kernels/nna/subgraph_compute.cc
@@ -0,0 +1,249 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/nna/subgraph_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/nna/bridges/graph.h" +#include "lite/kernels/nna/bridges/paddle_use_bridges.h" +#include "lite/kernels/nna/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace nna { + +bool SubgraphEngine::BuildDeviceProgram() { + int status = 0; + // Convert all of ops and their input vars and weights and added into the NNA + // IMG IR graph + subgraph::nna::Graph graph{&imgdnn_mgr_}; + const auto& bridges = subgraph::Registry::Instance(); + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kNNA))) { + // return subgraph::FAILED; + return false; + } + auto kernel = inst.kernel(); + status |= + bridges.Select(op_type, TARGET(kNNA))(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + // return subgraph::FAILED; + return false; + } + } + + // Collect the valid input and output nodes in the IMGDNN IR graph and update + // the input and output names + device_inames_.clear(); + std::vector device_inodes; + for (auto& input_name : input_names_) { + if (graph.Has(input_name)) { + device_inodes.push_back(graph.Get(input_name)->data()); + device_inames_.push_back(input_name); + } else { + LOG(WARNING) << "[NNA] Input node " << input_name + << " is ignored because it does not exist."; + } + } + + device_onames_.clear(); + std::vector device_onodes; + for (auto& output_name : output_names_) { + if (graph.Has(output_name)) { + device_onodes.push_back(graph.Get(output_name)->data()); + device_onames_.push_back(output_name); + } else { + LOG(WARNING) << "[NNA] Output node " << output_name + << " is ignored because it does not exist."; + } + } + CHECK(!device_inames_.empty()) + << "[NNA] No input nodes found for building NNA model"; + CHECK(!device_onames_.empty()) + << "[NNA] No output nodes found for building NNA model"; + + imgdnn_mgr_.createNetworkObject(device_inodes.size(), + device_inodes.data(), + device_onodes.size(), + device_onodes.data()); + + // inputs + unsigned int num_inputs, num_outputs; + imgdnn_mgr_.getNetworkObjectInputs( + std::numeric_limits::max(), nullptr, &num_inputs); + CHECK_EQ(num_inputs, device_inames_.size()); + // origin_idims_.resize(num_inputs); + // origin_itensors_.resize(num_inputs); + device_itensors_.resize(num_inputs); + imgdnn_mgr_.getNetworkObjectInputs( + num_inputs, device_itensors_.data(), nullptr); + + // show input info + for (int i = 0; i < num_inputs; i++) { + auto node = graph.Get(device_inames_[i]); + auto type = node->type(); + auto layout = node->layout(); + // origin_itensors_[i] = 
scope_->FindMutableTensor(device_inames_[i]); + // CHECK(origin_itensors_[i]); + // origin_idims_[i] = origin_itensors_[i]->dims(); + VLOG(3) << "[NNA] Inputs[" << i << "] name: " << device_inames_[i] + << " type: " << type << " layout: " << DataLayoutToStr(layout); + } + + // outputs + imgdnn_mgr_.getNetworkObjectOutputs( + std::numeric_limits::max(), nullptr, &num_outputs); + CHECK_EQ(num_outputs, device_onames_.size()); + // origin_odims_.resize(num_outputs); + // origin_otensors_.resize(num_outputs); + device_otensors_.resize(num_outputs); + imgdnn_mgr_.getNetworkObjectOutputs( + num_outputs, device_otensors_.data(), nullptr); + // show output info + for (int i = 0; i < num_outputs; i++) { + auto node = graph.Get(device_onames_[i]); + auto type = node->type(); + auto layout = node->layout(); + // origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + // CHECK(origin_otensors_[i]); + // origin_odims_[i] = origin_otensors_[i]->dims(); + VLOG(3) << "[NNA] Outputs[" << i << "] name: " << device_onames_[i] + << " type: " << type << " layout: " << DataLayoutToStr(layout); + // Prepare the device output tensors + switch (type) { + case IMGDNN_TYPE_F32: + origin_otensors_[i]->mutable_data(); + break; + case IMGDNN_TYPE_Q_I8: + case IMGDNN_TYPE_Q_U8: + origin_otensors_[i]->mutable_data(); + break; + case IMGDNN_TYPE_I16: + origin_otensors_[i]->mutable_data(); + break; + case IMGDNN_TYPE_I32: + origin_otensors_[i]->mutable_data(); + break; + default: + LOG(FATAL) << "[NNA] " << device_onames_[i] + << " can't mutable data with precision type " << type; + break; + } + } + + return true; +} + +bool SubgraphEngine::LaunchDeviceProgram() { + // Set input buffer + for (size_t i = 0; i < origin_itensors_.size(); i++) { + // check input shapes + imgdnn_tensor_descriptor in_desc = + imgdnn_mgr_.getInputDescriptor(device_itensors_[i]); + size_t in_size = imgdnn_mgr_.getDescriptorSize(&in_desc); + CHECK_EQ(in_size, origin_itensors_[i]->memory_size()); + + auto origin_data = origin_itensors_[i]->mutable_data(); + auto converted_data = reinterpret_cast(origin_data); + for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { + converted_data[j] = + static_cast(static_cast(origin_data[j]) + 128); + } + + imgdnn_memory in_mem = imgdnn_mgr_.importMemory( + static_cast(converted_data), origin_itensors_[i]->memory_size()); + imgdnn_mgr_.addBindingInput(device_itensors_[i], in_mem); + } + + // Set output buffer + std::vector out_mems; + for (size_t i = 0; i < origin_otensors_.size(); i++) { + // check output shapes + imgdnn_tensor_descriptor out_desc = + imgdnn_mgr_.getOutputDescriptor(device_otensors_[i]); + size_t out_size = imgdnn_mgr_.getDescriptorSize(&out_desc); + CHECK_EQ(out_size, origin_otensors_[i]->memory_size()); + + imgdnn_memory out_mem = + imgdnn_mgr_.allocateMemory(origin_otensors_[i]->memory_size()); + imgdnn_mgr_.addBindingOutput(device_otensors_[i], out_mem); + out_mems.push_back(out_mem); + } + + // Run the img model by name + imgdnn_mgr_.executeNetworkObject(true, 0, nullptr, nullptr); + + // Copy the data of output tensor to the buffer of origin output tensors + for (size_t i = 0; i < out_mems.size(); i++) { + uint8_t* data = static_cast( + imgdnn_mgr_.lockMemory(out_mems[i], IMGDNN_LOCK_ACCESS_READ_ONLY)); + + int8_t* output_data = origin_otensors_[i]->mutable_data(); + for (size_t j = 0; j < origin_otensors_[i]->data_size(); j++) { + output_data[j] = data[j] - 128; + } + imgdnn_mgr_.unlockMemory(out_mems[i]); + imgdnn_mgr_.destroyMemory(out_mems[i]); + } + + return true; 
+}
+
+void SubgraphCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  engine_.reset(new SubgraphEngine(ctx_.get(),
+                                   param.block_idx,
+                                   param.program_desc,
+                                   param.exec_scope,
+                                   param.input_data_names,
+                                   param.output_data_names));
+  CHECK(engine_);
+}
+
+void SubgraphCompute::Run() {
+  CHECK(engine_);
+  engine_->Run();
+}
+
+}  // namespace nna
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(subgraph,
+                     kNNA,
+                     kInt8,
+                     kNCHW,
+                     paddle::lite::kernels::nna::SubgraphCompute,
+                     def)
+    .BindInput("Inputs",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8))})
+    .BindOutput("Outputs",
+                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8))})
+    .Finalize();
diff --git a/lite/kernels/nna/subgraph_compute.h b/lite/kernels/nna/subgraph_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e28f10c1376f85c3475fe2c81fd49f8dd5c718a
--- /dev/null
+++ b/lite/kernels/nna/subgraph_compute.h
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "imgdnn.h"  // NOLINT
+#include "lite/backends/nna/imgdnn_manager.h"
+#include "lite/core/kernel.h"
+#include "lite/kernels/nna/bridges/graph.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace nna {
+
+class SubgraphEngine : public subgraph::Engine {
+ public:
+  SubgraphEngine(KernelContext* ctx,
+                 int block_idx,
+                 const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
+                 Scope* exec_scope,
+                 const std::vector<std::string>& input_names,
+                 const std::vector<std::string>& output_names)
+      : subgraph::Engine(ctx,
+                         block_idx,
+                         program_desc,
+                         exec_scope,
+                         input_names,
+                         output_names) {}
+
+  ~SubgraphEngine() {}
+
+ protected:
+  bool BuildDeviceProgram() override;
+  bool LaunchDeviceProgram() override;
+
+  std::vector<std::string> device_inames_;
+  std::vector<std::string> device_onames_;
+  std::vector<imgdnn_input> device_itensors_;
+  std::vector<imgdnn_output> device_otensors_;
+  lite::nna::ImgdnnManager imgdnn_mgr_;
+};
+
+class SubgraphCompute
+    : public KernelLite<TARGET(kNNA), PRECISION(kInt8)> {
+ public:
+  using param_t = operators::SubgraphParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~SubgraphCompute() {
+    VLOG(5) << "~SubgraphCompute";
+    engine_.reset();
+  }
+
+ private:
+  std::unique_ptr<SubgraphEngine> engine_;
+};
+
+}  // namespace nna
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py
index 0cf14d12d553a4d9f7f4ed9780e4274560a8b23f..cf4b0a9600f59c625f27dfcd5dc62fab1a4f874f 100644
--- a/lite/tools/cmake_tools/record_supported_kernel_op.py
+++ b/lite/tools/cmake_tools/record_supported_kernel_op.py
@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
 ops_lines = []
 
 # valid targets and valid_ops
-valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", 
"kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU", "kNNA"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 @@ -74,6 +74,7 @@ class TargetType: kRKNPU = 12 kAPU = 13 kHuaweiAscendNPU = 14 + kNNA = 15 # record op_info of valid kernels into `valid_ops` according to different target type diff --git a/tools/codestyle/clang_format.hook b/tools/codestyle/clang_format.hook index 1d928216867c0ba3897d71542fea44debf8d72a0..063ec099d96cd897af2263f89246b75a36add70f 100755 --- a/tools/codestyle/clang_format.hook +++ b/tools/codestyle/clang_format.hook @@ -1,7 +1,7 @@ #!/bin/bash set -e -readonly VERSION="3.8" +readonly VERSION="6.0.0" version=$(clang-format -version)