Commit 30145270 authored by: airockchip, committed by: GitHub

[RKNPU] Add Rockchip NPU backend (#3382)

Parent commit: bfe8b250
......@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
......@@ -129,6 +130,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11
endif()
if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -134,6 +134,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_RKNPU)
return()
endif()
if(NOT DEFINED RKNPU_DDK_ROOT)
set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
if(NOT RKNPU_DDK_ROOT)
message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(RKNPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(RKNPU_SUB_LIB_PATH "lib")
endif()
find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
if(NOT RKNPU_DDK_FILE)
message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
else()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
endif()
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
......@@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -142,6 +148,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
......@@ -177,7 +184,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -191,6 +198,7 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -226,7 +234,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -248,6 +256,7 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -280,6 +289,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -295,12 +305,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -323,6 +333,12 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
......@@ -379,6 +395,15 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS})
......@@ -427,6 +452,7 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
......@@ -481,6 +507,7 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
......
......@@ -7,6 +7,7 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
......@@ -76,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......
......@@ -34,9 +34,11 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels})
if(NOT APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
......@@ -59,6 +61,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif()
endif()
......@@ -69,6 +76,7 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
......@@ -82,6 +90,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
......@@ -90,6 +104,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
......@@ -107,6 +122,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
......@@ -128,6 +144,7 @@ lite_cc_library(light_api SRCS light_api.cc
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -147,6 +164,7 @@ if(WITH_TESTING)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -248,6 +266,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: ocr_test_ut is commented out because no OCR model is supplied for testing; it is kept as a reference for inferring NLP models
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
......@@ -291,6 +310,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -300,6 +320,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
......@@ -335,6 +356,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -356,6 +378,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -369,6 +392,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -382,6 +406,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -392,6 +417,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -404,17 +430,20 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -109,6 +109,10 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
......@@ -187,6 +191,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA",
"kNPU",
"kXPU",
"kRKNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -251,16 +256,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
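For context, a minimal C++ sketch of how an application could request the new RKNPU target through the Paddle-Lite CxxConfig API; the model directory and the ARM fallback place are assumptions for illustration, not part of this patch:

#include "paddle_api.h"  // Paddle-Lite full-API header

using namespace paddle::lite_api;

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  // Prefer the Rockchip NPU (int8) and fall back to ARM CPU kernels,
  // mirroring the places added in ParserValidPlaces() above.
  config.set_valid_places({
      Place{TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)},
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return predictor != nullptr ? 0 : 1;
}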
......@@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) {
"npu",
"xpu",
"bm",
"mlu"};
"mlu",
"rknpu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) {
"kNPU",
"kXPU",
"kMLU",
"kBM"};
"kBM",
"kRKNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......
......@@ -54,8 +54,9 @@ enum class TargetType : int {
kXPU = 9,
kBM = 10,
kMLU = 11,
kRKNPU = 12,
kAny = 6, // any target
NUM = 12, // number of fields.
NUM = 13, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -8,3 +8,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(rknpu)
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
std::unique_ptr<rk::nn::Exection> Device::Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
) {
VLOG(3) << "[RKNPU] Build model";
rk_graph->SetInputsOutputs(input_nodes, output_nodes);
std::unique_ptr<rk::nn::Exection> exector =
std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
exector->Build();
return exector;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
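A minimal sketch of how a subgraph engine might call Device::Build(); the graph and the input/output tensor vectors are assumed to have been populated by the RKNPU bridge converters, and the model name is a hypothetical identifier:

// Sketch only: not the engine code added by this commit.
std::unique_ptr<rk::nn::Exection> BuildSubgraph(
    rk::nn::Graph* rk_graph,
    const std::vector<std::shared_ptr<rk::nn::Tensor>>& input_nodes,
    const std::vector<std::shared_ptr<rk::nn::Tensor>>& output_nodes) {
  std::string model_name{"rknpu_subgraph_0"};
  // Device::Build() binds the graph inputs/outputs, compiles the graph and
  // returns an rk::nn::Exection that can be used to run inference.
  return paddle::lite::rknpu::Device::Global().Build(
      model_name, rk_graph, input_nodes, output_nodes);
}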
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
// Build the RK IR graph into a device model and return an executor
// that loads the model and runs inference.
std::unique_ptr<rk::nn::Exection> Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
); // NOLINT
private:
};
} // namespace rknpu
} // namespace lite
} // namespace paddle
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -59,6 +59,7 @@ using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -103,6 +104,21 @@ class Context<TargetType::kBM> {
};
#endif
#ifdef LITE_WITH_RKNPU
template <>
class Context<TargetType::kRKNPU> {
public:
Context() {}
explicit Context(const RKNPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(RKNPUContext* ctx) {}
RKNPUContext& operator=(const RKNPUContext& ctx) { return *this; }
std::string name() const { return "RKNPUContext"; }
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
......@@ -392,6 +408,12 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
&ctx->As<RKNPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU
case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
......@@ -461,6 +483,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
#ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>();
#endif
......
......@@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)});
......@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU)});
.BindTargets({TARGET(kNPU), TARGET(kRKNPU)});
......@@ -69,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void RKNPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
......@@ -93,5 +107,7 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
.BindTargets({TARGET(kRKNPU)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
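The RKNPUSubgraphPass above includes lite/kernels/rknpu/bridges/paddle_use_bridges.h to collect the supported op types. Based on the REGISTER_SUBGRAPH_BRIDGE entries added later in this commit, that header is expected to contain lines roughly like the following sketch (the actual file is not shown in this hunk):

USE_SUBGRAPH_BRIDGE(relu, kRKNPU);
USE_SUBGRAPH_BRIDGE(conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(fc, kRKNPU);
USE_SUBGRAPH_BRIDGE(batch_norm, kRKNPU);
USE_SUBGRAPH_BRIDGE(concat, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kRKNPU);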
......@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class RKNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
......@@ -110,6 +110,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kMLU): {
CREATE_KERNEL(kMLU);
} break;
case TARGET(kRKNPU): {
CREATE_KERNEL(kRKNPU);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
......@@ -232,6 +235,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kBM, kInt8, kNCHW);
INIT_FOR(kBM, kAny, kNCHW);
INIT_FOR(kBM, kAny, kAny);
INIT_FOR(kRKNPU, kFloat, kNCHW);
INIT_FOR(kRKNPU, kInt8, kNCHW);
INIT_FOR(kRKNPU, kAny, kNCHW);
INIT_FOR(kRKNPU, kAny, kAny);
#undef INIT_FOR
}
......
......@@ -251,6 +251,16 @@ class KernelRegistry final {
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -86,6 +86,7 @@ class Optimizer {
"npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's
// info(target/precision/layout/device)
......
......@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -12,3 +12,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(rknpu)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU)
return()
endif()
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_rknpu RKNPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_rknpu ${rknpu_subgraph_bridges})
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_rknpu SRCS utility.cc DEPS ${rknpu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_rknpu SRCS graph.cc DEPS subgraph_bridge_utility_rknpu)
set(rknpu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_rknpu subgraph_bridge_graph_rknpu)
lite_cc_library(subgraph_bridge_conv_op_rknpu SRCS conv_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_rknpu SRCS act_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_rknpu SRCS softmax_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_rknpu SRCS pool_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_rknpu SRCS fc_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_rknpu SRCS batch_norm_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_rknpu SRCS concat_op.cc DEPS ${rknpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_rknpu SRCS elementwise_ops.cc DEPS ${rknpu_subgraph_bridge_deps})
set(rknpu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_rknpu
subgraph_bridge_graph_rknpu
subgraph_bridge_conv_op_rknpu
subgraph_bridge_act_op_rknpu
subgraph_bridge_softmax_op_rknpu
subgraph_bridge_pool_op_rknpu
subgraph_bridge_fc_op_rknpu
subgraph_bridge_batch_norm_op_rknpu
subgraph_bridge_concat_op_rknpu
subgraph_bridge_elementwise_ops_rknpu
CACHE INTERNAL "rknpu_subgraph_bridges")
message(STATUS "+++++ rknpu_subgraph_bridges: ${rknpu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
// #include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->precision() == PRECISION(kFloat));
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->precision() == PRECISION(kFloat));
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
CHECK_EQ(op_type, "relu");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_var_name)) {
x_node = graph->Get(x_var_name);
} else {
x_node = graph->Add(x_var_name, *x, x_type->precision(), x_type->layout());
}
auto output_node = graph->Add(
output_var_name, *output, out_type->precision(), out_type->layout());
auto rGraph = graph->GetHandle();
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
auto relu =
rGraph->AddOperator(rk::nn::OperatorType::RELU, inputs, outputs, nullptr);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu,
kRKNPU,
paddle::lite::subgraph::rknpu::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto scale_name = op_info->Input("Scale").front();
auto scale_type = kernel->GetInputDeclType("Scale");
CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
auto scale = scope->FindMutableTensor(scale_name);
auto bias_name = op_info->Input("Bias").front();
auto bias_type = kernel->GetInputDeclType("Bias");
CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
auto bias = scope->FindMutableTensor(bias_name);
auto mean_name = op_info->Input("Mean").front();
auto mean_type = kernel->GetInputDeclType("Mean");
CHECK(mean_type->layout() == DATALAYOUT(kNCHW));
auto mean = scope->FindMutableTensor(mean_name);
auto variance_name = op_info->Input("Variance").front();
auto variance_type = kernel->GetInputDeclType("Variance");
CHECK(variance_type->layout() == DATALAYOUT(kNCHW));
auto variance = scope->FindMutableTensor(variance_name);
auto y_name = op_info->Output("Y").front();
auto y_type = kernel->GetOutputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
CHECK(y_type->layout() == DATALAYOUT(kNCHW));
float momentum = op_info->GetAttr<float>("momentum");
float epsilon = op_info->GetAttr<float>("epsilon");
int mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
bool use_global_stats = op_info->GetAttr<bool>("use_global_stats");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Scale, Bias, Mean, Variance node
auto scale_node = graph->Add(scale_name, *scale);
auto bias_node = graph->Add(bias_name, *bias);
auto mean_node = graph->Add(mean_name, *mean);
auto variance_node = graph->Add(variance_name, *variance);
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
y->mutable_data<int8_t>();
}
output_node = graph->Add(y_name, *y, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
inputs.push_back(mean_node->data());
inputs.push_back(variance_node->data());
inputs.push_back(scale_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
rk::nn::BatchNormAttr attrs;
attrs.eps = epsilon;
auto rGraph = graph->GetHandle();
auto bn = rGraph->AddOperator(
rk::nn::OperatorType::BATCH_NORM, inputs, outputs, &attrs);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(batch_norm,
kRKNPU,
paddle::lite::subgraph::rknpu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " << op_type << " ... ";
// Get input and output vars and op attributes
auto x_names = op_info->Input("X");
auto x_type = kernel->GetInputDeclType("X");
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
auto num = x_names.size();
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Traverse all of the input nodes, which are added to the newly created
// concat node
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
int idx = 1;
for (auto& x_name : x_names) {
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.quant_bits = bit_length;
qnt.scale.push_back(input_scale);
x->mutable_data<int8_t>();
}
x_node =
graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
}
inputs.push_back(x_node->data());
idx++;
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
outputs.push_back(output_node->data());
rk::nn::ConcatAttr attrs;
attrs.axis = axis;
auto rGraph = graph->GetHandle();
auto concat = rGraph->AddOperator(
rk::nn::OperatorType::CONCAT, inputs, outputs, &attrs);
return SUCCESS;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kRKNPU,
paddle::lite::subgraph::rknpu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <algorithm>
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " << op_type << "... ";
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
auto output_name = op_info->Output("Output").front();
auto output = scope->FindMutableTensor(output_name);
auto output_dims = output->dims();
auto bs = input_dims[0];
auto ic = input_dims[1];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4L);
CHECK_EQ(output_dims.size(), 4L);
CHECK_EQ(filter_dims.size(), 4L);
CHECK_EQ(output_dims[0], bs);
CHECK_EQ(output_dims[1], oc);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto fuse_relu = op_info->GetAttr<bool>("fuse_relu");
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
// Check depthwise mode
bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Input node
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.clear();
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
input_node =
graph->Add(input_name, *input, input->precision(), layout, qnt);
}
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L)
<< "[RKNPU] Paddings size should be the same as or twice the strides size.";
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
// Filter node
std::shared_ptr<Node> filter_node = nullptr;
QuantizationInfo filter_qnt;
filter_qnt.enable_int8 = enable_int8;
if (enable_int8) {
filter_qnt.scale = weight_scale;
filter_qnt.quant_bits = bit_length;
}
filter_node =
graph->Add(filter_name, *filter, filter->precision(), layout, filter_qnt);
// Add bias node if exists bias
// Supports the bias nodes with the following dimensions
// 0: {oc}
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
} else {
LOG(WARNING)
<< "[RKNPU] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
return FAILED;
}
if (enable_int8) {
auto bias_name_qnt = bias_name + "/qnt";
auto* bias_qnt = scope->NewTensor(bias_name_qnt);
bias_qnt->Resize(bias_shape);
bias_qnt->set_persistable(true);
bias_qnt->set_precision(PrecisionType::kInt32);
auto* bias_qnt_data = bias_qnt->mutable_data<int32_t>();
auto* bias_data = bias->mutable_data<float>();
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
auto dtype_max = static_cast<int>((1 << (qnt.quant_bits - 1)) - 1);
auto dtype_min = static_cast<int>(0 - dtype_max);
for (int i = 0; i < oc; i++) {
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / qnt.scale[i]),
dtype_min),
dtype_max);
}
bias_node = graph->Add(
bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt);
} else {
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
} else {
auto bias_name = filter_name + "/bias/dummy";
auto* bias = scope->NewTensor(bias_name);
std::vector<int64_t> bias_shape = {oc};
bias->Resize(bias_shape);
bias->set_persistable(true);
if (enable_int8) {
bias->set_precision(PrecisionType::kInt32);
auto* bias_data = bias->mutable_data<int32_t>();
for (int i = 0; i < oc; i++) {
bias_data[i] = 0;
}
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt);
} else {
bias->set_precision(PrecisionType::kFloat);
auto* bias_data = bias->mutable_data<float>();
for (int i = 0; i < oc; i++) {
bias_data[i] = 0.0;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
// Conv node
std::shared_ptr<Node> conv_node = nullptr;
std::shared_ptr<Node> output_node = nullptr;
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(output_name, *output, precision, layout, output_qnt);
inputs.push_back(input_node->data());
inputs.push_back(filter_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
rk::nn::Conv2DAttr attr;
attr.ksize[0] = filter_dims[2];
attr.ksize[1] = filter_dims[3];
attr.stride[0] = strides[0];
attr.stride[1] = strides[1];
attr.pad[0] = paddings[0];
attr.pad[1] = paddings[1];
attr.pad[2] = paddings[2];
attr.pad[3] = paddings[3];
attr.group = groups;
attr.weights = oc;
attr.dilation[0] = dilations[0];
attr.dilation[1] = dilations[1];
attr.pad_type = rk::nn::PadType::AUTO;
attr.has_relu = fuse_relu;
if (is_depthwise_mode) {
attr.multiplier = 1;
} else {
attr.multiplier = 0;
}
auto rGraph = graph->GetHandle();
auto conv = rGraph->AddOperator(
rk::nn::OperatorType::CONV2D, inputs, outputs, &attr, output_name);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(conv2d,
kRKNPU,
paddle::lite::subgraph::rknpu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
kRKNPU,
paddle::lite::subgraph::rknpu::ConvConverter);
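As a side note on the paddings handling in ConvConverter above: when the op carries only two padding values, each one is duplicated per spatial dimension before UpdatePaddingAndDilation() is called. A small self-contained sketch of that expansion:

#include <vector>

int main() {
  // {pad_h, pad_w} -> {pad_h, pad_h, pad_w, pad_w}, as done in ConvConverter.
  std::vector<int> paddings{1, 2};
  for (size_t i = 0; i < 2; ++i) {
    int copy_pad = *(paddings.begin() + 2 * i);
    paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
  }
  // paddings now holds {1, 1, 2, 2}.
  return paddings.size() == 4 ? 0 : 1;
}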
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
std::vector<int64_t> CvtYShape(const DDim& x_dims,
const DDim& y_dims,
int axis) {
CHECK_EQ(x_dims.size(), 4UL) << "[RKNPU] Only support 4-dimension x";
CHECK_GE(x_dims.size(), y_dims.size());
if (axis < 0) {
axis += x_dims.size();
}
std::vector<int64_t> y_new_shape(y_dims.Vectorize());
if (y_new_shape.size() == 4UL) {
return y_new_shape;
}
for (int i = 0; i < axis; i++) {
y_new_shape.insert(y_new_shape.begin(), 1);
}
while (y_new_shape.size() < 4) {
y_new_shape.push_back(1);
}
CHECK_EQ(y_new_shape.size(), 4UL);
return y_new_shape;
}
int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto y_name = op_info->Input("Y").front();
auto y_type = kernel->GetInputDeclType("Y");
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.clear();
qnt.scale.push_back(input_scale);
qnt.quant_bits = op_info->GetAttr<int>("bit_length");
}
x_node = graph->Add(x_name, *x, x_type->precision(), x_type->layout(), qnt);
}
// Y node
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
} else {
// auto y_new_shape = CvtYShape(x_dims, y_dims, axis);
// y_node = graph->Add(y_name, *y, y_new_shape);
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.quant_bits = bit_length;
qnt.scale.clear();
qnt.scale.push_back(input_scale);
}
y_node = graph->Add(y_name, *y, y_type->precision(), y_type->layout(), qnt);
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.clear();
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(
out_name, *output, x_type->precision(), x_type->layout(), output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
inputs.push_back(y_node->data());
outputs.push_back(output_node->data());
auto rGraph = graph->GetHandle();
// Elementwise node
if (op_type == "elementwise_add") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::ADD, inputs, outputs, nullptr);
} else if (op_type == "elementwise_sub") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::SUBTRACT, inputs, outputs, nullptr);
} else if (op_type == "elementwise_mul") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::MULTIPLY, inputs, outputs, nullptr);
} else if (op_type == "elementwise_div") {
auto elt_node = rGraph->AddOperator(
rk::nn::OperatorType::DIVIDE, inputs, outputs, nullptr);
} else {
LOG(WARNING) << "[RKNPU] Unsupported op type: " << op_type;
return FAILED;
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(elementwise_add,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_sub,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_mul,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(elementwise_div,
kRKNPU,
paddle::lite::subgraph::rknpu::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
auto input_name = op_info->Input("Input").front();
auto input_type = kernel->GetInputDeclType("Input");
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
CHECK_GE(input_dims.size(), 2UL);
auto w_name = op_info->Input("W").front();
auto w_type = kernel->GetInputDeclType("W");
auto w = scope->FindMutableTensor(w_name);
auto w_dims = w->dims();
CHECK_EQ(w_dims.size(), 2UL);
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
auto output = scope->FindMutableTensor(out_name);
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
int m = input_dims.Slice(0, in_num_col_dims).production();
int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "[RKNPU] input dims: " << input_dims << " w dims: " << w_dims
<< " m: " << m << " k: " << k << " n: " << n;
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// Create input node and reshape it to (m, k, 1, 1)
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
input_node = graph->Add(input_name, *input);
}
// Create w const node, set its shape to (n, k) and fill with
// the transposed w tensor
auto* transpose_w = scope->NewTensor(w_name + "/transpose");
std::shared_ptr<Node> trans_w_node = nullptr;
transpose_w->Resize({n, k});
transpose_w->set_persistable(true);
if (enable_int8) {
QuantizationInfo filter_qnt;
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
filter_qnt.enable_int8 = enable_int8;
filter_qnt.scale = weight_scale;
filter_qnt.quant_bits = bit_length;
auto transpose_w_data = transpose_w->mutable_data<int8_t>();
auto w_data = w->mutable_data<int8_t>();
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node = graph->Add(
w_name, *transpose_w, precision, w_type->layout(), filter_qnt);
} else {
auto transpose_w_data = transpose_w->mutable_data<float>();
auto w_data = w->mutable_data<float>();
for (int i = 0; i < k; i++) {
for (int j = 0; j < n; j++) {
transpose_w_data[j * k + i] = w_data[i * n + j];
}
}
trans_w_node =
graph->Add(w_name, *transpose_w, precision, w_type->layout());
}
// Add bias node if bias tensor exists
std::shared_ptr<Node> bias_node = nullptr;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
std::vector<int64_t> bias_shape = {n};
VLOG(3) << "[RKNPU] bias precision: "
<< PrecisionToStr(bias->precision());
// We need to quantize bias
if (enable_int8) {
auto bias_name_qnt = bias_name + "/qnt";
auto* bias_qnt = scope->NewTensor(bias_name_qnt);
auto weight_scale =
op_info->GetAttr<std::vector<float>>("weight_scale");
bias_qnt->Resize(bias_shape);
bias_qnt->set_persistable(true);
bias_qnt->set_precision(PrecisionType::kInt32);
auto* bias_qnt_data = bias_qnt->mutable_data<int32_t>();
auto* bias_data = bias->mutable_data<float>();
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
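        // Quantize the float bias to int32 with scale = input_scale *
        // weight_scale, clamping to the symmetric int32 range.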
auto dtype_max = static_cast<int>((1 << (qnt.quant_bits - 1)) - 1);
auto dtype_min = static_cast<int>(0 - dtype_max);
for (int i = 0; i < n; i++) {
bias_qnt_data[i] =
std::min(std::max(static_cast<int>(bias_data[i] / qnt.scale[i]),
dtype_min),
dtype_max);
}
bias_node = graph->Add(
bias_name, *bias_qnt, bias_qnt->precision(), layout, qnt);
} else {
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
} else {
auto bias_name = w_name + "/bias/dummy";
auto* bias = scope->NewTensor(bias_name);
std::vector<int64_t> bias_shape = {n};
bias->Resize(bias_shape);
bias->set_persistable(true);
if (enable_int8) {
auto weight_scale = op_info->GetAttr<std::vector<float>>("weight_scale");
bias->set_precision(PrecisionType::kInt32);
auto* bias_data = bias->mutable_data<int32_t>();
for (int i = 0; i < n; i++) {
bias_data[i] = 0;
}
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
qnt.quant_bits = 32;
qnt.scale.resize(weight_scale.size());
for (int i = 0; i < weight_scale.size(); i++) {
qnt.scale[i] = input_scale * weight_scale[i];
}
bias_node = graph->Add(bias_name, *bias, bias->precision(), layout, qnt);
} else {
bias->set_precision(PrecisionType::kFloat);
auto* bias_data = bias->mutable_data<float>();
for (int i = 0; i < n; i++) {
bias_data[i] = 0.0;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.clear();
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(input_node->data());
inputs.push_back(trans_w_node->data());
inputs.push_back(bias_node->data());
outputs.push_back(output_node->data());
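  // FCAttr::weights is the output feature count (n); no fused activation.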
rk::nn::FCAttr attrs;
attrs.weights = n;
attrs.has_relu = false;
auto rGraph = graph->GetHandle();
auto fc = rGraph->AddOperator(
rk::nn::OperatorType::FULLCONNECT, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fc,
kRKNPU,
paddle::lite::subgraph::rknpu::FCConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/bridges/graph.h"
#include <rknpu/graph.h>
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[RKNPU] Const or data node " << name << " is redefined.";
return -1;
}
} else {
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
it->second.push_back(node);
return it->second.size();
}
// Const or data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout,
const QuantizationInfo& qnt) {
std::shared_ptr<Node> node = nullptr;
if (precision == PrecisionType::kUnk) {
precision = tensor.precision(); // todo
}
if (precision == PrecisionType::kUnk) {
if (qnt.enable_int8 && qnt.quant_bits == 8) {
precision = PrecisionType::kInt8;
} else if (!qnt.enable_int8) {
precision = PrecisionType::kFloat;
} else {
LOG(ERROR) << "[rknpu]:Graph:: tensor precision unknown!";
}
}
if (precision != tensor.precision()) {
LOG(INFO) << "[rknpu]:Graph::Add: tensor precision mismatch!" << name << ":"
<< PrecisionToStr(precision) << " vs "
<< PrecisionToStr(tensor.precision());
}
if (tensor.persistable()) {
// Const node
node = std::make_shared<Node>(precision, layout, Node::Role::kConst);
auto idx = Add(name, node);
CHECK_EQ(idx, 1);
auto attr = std::make_shared<rk::nn::TensorAttr>();
attr->precision = ToRknpuPrecisionType(precision);
attr->layout = ToRknpuDataLayoutType(layout);
attr->role = rk::nn::TensorRole::CONST;
attr->name = name;
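    // Attach symmetric quantization parameters for int8 weights and int32
    // bias tensors; float tensors carry no quantization attributes.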
switch (precision) {
case PrecisionType::kInt8:
attr->qntBits = 8;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
case PrecisionType::kInt32:
attr->qntBits = 32;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
default:
break;
}
attr->dims.resize(shape.size());
for (int i = 0; i < shape.size(); i++) {
attr->dims[i] = shape[i];
}
LOG(INFO) << "[rknpu]:Graph::Add const node:" << name
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
node->set_data(
rgraph_->CreateTensor(attr, const_cast<void*>(tensor.raw_data())));
} else {
// Data node
node = Add(name, shape, precision, layout, qnt);
}
return node;
}
// Data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout,
const QuantizationInfo& qnt) {
auto node = std::make_shared<Node>(precision, layout, Node::Role::kData);
auto idx = Add(name, node);
CHECK_EQ(idx, 1);
auto attr = std::make_shared<rk::nn::TensorAttr>();
attr->precision = ToRknpuPrecisionType(precision);
attr->layout = ToRknpuDataLayoutType(layout);
attr->role = rk::nn::TensorRole::VAR;
attr->name = name;
switch (precision) {
case PrecisionType::kInt8:
attr->qntBits = 8;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
case PrecisionType::kInt32:
attr->qntBits = 32;
attr->qntType = rk::nn::QuantizationType::SYMMETRIC;
attr->qntParamSymmetric.scale = qnt.scale;
break;
default:
break;
}
attr->dims.resize(shape.size());
for (int i = 0; i < shape.size(); i++) {
attr->dims[i] = shape[i];
}
LOG(INFO) << "[rknpu]:Graph::Add data node:" << name
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
node->set_data(rgraph_->CreateTensor(attr, nullptr)); // todo
return node;
}
Graph::Graph() {
rgraph_ = new rk::nn::Graph();
CHECK(rgraph_ != nullptr);
}
Graph::~Graph() {}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
// Graph and Node are defined to collect all of the converted RKNPU IR nodes
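// Quantization info attached to a node: enable_int8 marks whether the tensor
// is quantized, quant_bits is the bit width (8 for activations/weights, 32
// for bias), and scale holds the per-tensor or per-channel scales.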
struct QuantizationInfo {
int enable_int8;
int quant_bits;
std::vector<float> scale;
};
class Node {
public:
enum class Role {
kVar = 0,
kConst,
kData,
};
Node(std::shared_ptr<rk::nn::Tensor> data,
PrecisionType precision,
DataLayoutType layout,
Role role)
: data_(data), precision_(precision), layout_(layout), role_(role) {}
Node(PrecisionType precision, DataLayoutType layout, Role role)
: precision_(precision), layout_(layout), role_(role) {}
void set_data(std::shared_ptr<rk::nn::Tensor> data) { data_ = data; }
void set_precision(PrecisionType precision) { precision_ = precision; }
void set_layout(DataLayoutType layout) { layout_ = layout; }
void set_role(Role role) { role_ = role; }
void set_quant_param(const QuantizationInfo& qnt) { qnt_ = qnt; }
std::shared_ptr<rk::nn::Tensor> data() { return data_; }
PrecisionType precision() const { return precision_; }
DataLayoutType layout() const { return layout_; }
Role role() const { return role_; }
bool is_var() const { return role_ == Role::kVar; }
bool is_const() const { return role_ == Role::kConst; }
bool is_data() const { return role_ == Role::kData; }
private:
std::shared_ptr<rk::nn::Tensor> data_{nullptr};
PrecisionType precision_{PRECISION(kFloat)};
DataLayoutType layout_{DATALAYOUT(kNCHW)};
Role role_{Role::kVar};
QuantizationInfo qnt_;
};
class Graph {
public:
Graph();
~Graph();
public:
int Add(const std::string& name, std::shared_ptr<Node> node);
// Const or data node
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
PrecisionType precision = PRECISION(kUnk),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo());
std::shared_ptr<Node> Get(const std::string& name) {
CHECK(Has(name)) << "[RKNPU] Node " << name << " not found.";
return nodes_.at(name).back();
}
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
PrecisionType precision = PRECISION(kUnk),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo()) {
return Add(name, tensor, tensor.dims().Vectorize(), precision, layout, qnt);
}
// Data node
std::shared_ptr<Node> Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo());
std::shared_ptr<Node> Add(const std::string& name,
DDim dims,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW),
const QuantizationInfo& qnt = QuantizationInfo()) {
return Add(name, dims.Vectorize(), precision, layout, qnt);
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
rk::nn::Graph* GetHandle() { return rgraph_; }
private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
rk::nn::Graph* rgraph_;
};
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
USE_SUBGRAPH_BRIDGE(relu, kRKNPU);
USE_SUBGRAPH_BRIDGE(conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(pool2d, kRKNPU);
USE_SUBGRAPH_BRIDGE(fc, kRKNPU);
USE_SUBGRAPH_BRIDGE(softmax, kRKNPU);
USE_SUBGRAPH_BRIDGE(batch_norm, kRKNPU);
USE_SUBGRAPH_BRIDGE(concat, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_add, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kRKNPU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kRKNPU);
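For reference, every bridge registered above follows the same pattern as the fc, pool2d and softmax converters in this commit: look up the op's input/output tensors in the scope, add (or reuse) the matching nodes in subgraph::rknpu::Graph, and append a single RKNPU operator. The sketch below illustrates that pattern for a plain activation op. It is illustrative only, not the converter shipped in this commit: rk::nn::OperatorType::RELU, the null attribute pointer passed to AddOperator, and the omission of the int8 quantization handling are simplifying assumptions.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
// Minimal, float-only converter sketch; real converters also propagate
// quantization info (see the fc and pool2d bridges in this commit).
int ReluConverterSketch(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto scope = op->scope();
  // Input node: reuse it if an earlier converter already added it.
  auto x_name = op_info->Input("X").front();
  auto x = scope->FindMutableTensor(x_name);
  std::shared_ptr<Node> x_node =
      graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);
  // Output node.
  auto out_name = op_info->Output("Out").front();
  auto output = scope->FindMutableTensor(out_name);
  auto output_node = graph->Add(out_name, *output);
  // Append one RKNPU operator; RELU and the null attribute pointer are
  // assumptions about the rknpu_ddk API.
  std::vector<std::shared_ptr<rk::nn::Tensor>> inputs{x_node->data()};
  std::vector<std::shared_ptr<rk::nn::Tensor>> outputs{output_node->data()};
  graph->GetHandle()->AddOperator(
      rk::nn::OperatorType::RELU, inputs, outputs, nullptr);
  return REBUILD_WHEN_SHAPE_CHANGED;
}
}  // namespace rknpu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle
A real converter is then registered with REGISTER_SUBGRAPH_BRIDGE(relu, kRKNPU, ...) and declared in paddle_use_bridges.h, as done above.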
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
auto output = scope->FindMutableTensor(out_name);
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (x->precision() == PRECISION(kInt8)) {
// enable_int8 = op_info->GetAttr<bool>("enable_int8");
enable_int8 = true;
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
LOG(WARNING) << "[RKNPU] Pooling int8";
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
x_node = graph->Add(x_name, *x, x->precision(), layout, qnt);
}
// pool mode
rk::nn::PoolType mode = rk::nn::PoolType::POOLING_UNKNOWN;
if (pooling_type == "max") {
mode = rk::nn::PoolType::POOLING_MAX;
} else if (pooling_type == "avg") {
mode = rk::nn::PoolType::POOLING_AVG;
} else {
LOG(WARNING) << "[RKNPU] Unsupported pooling type: " << pooling_type;
return FAILED;
}
// pad mode
rk::nn::PadType pad_mode = rk::nn::PadType::AUTO;
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
if (padding_algorithm == "SAME") {
pad_mode = rk::nn::PadType::SAME;
} else if (padding_algorithm == "VALID") {
pad_mode = rk::nn::PadType::VALID;
}
// paddings and strides
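  // If only two padding values (height, width) are given, duplicate each one
  // so that paddings becomes {top, bottom, left, right}.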
if (paddings.size() == 2L) {
for (size_t i = 0; i < 2L; ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
  CHECK_EQ(paddings.size(), 4L)
      << "[RKNPU] Paddings size should be the same as or twice the number of "
         "spatial dimensions.";
bool adaptive = false;
if (op_info->HasAttr("adaptive")) {
adaptive = op_info->GetAttr<bool>("adaptive");
}
auto strides = op_info->GetAttr<std::vector<int>>("strides");
lite::operators::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
x->dims(),
strides,
ksize);
// ceil mode
int ceil_mode = 0;
if (op_info->HasAttr("ceil_mode")) {
ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
rk::nn::PoolAttr attrs;
attrs.ksize[0] = ksize[0];
attrs.ksize[1] = ksize[1];
attrs.stride[0] = strides[0];
attrs.stride[1] = strides[1];
attrs.pad[0] = paddings[0];
attrs.pad[1] = paddings[1];
attrs.pad[2] = paddings[2];
attrs.pad[3] = paddings[3];
attrs.pad_type = pad_mode;
attrs.pool_type = mode;
attrs.global_pooling = global_pooling;
if (ceil_mode) {
attrs.round_type = rk::nn::RoundType::ROUND_CEIL;
} else {
attrs.round_type = rk::nn::RoundType::ROUND_FLOOR;
}
auto rGraph = graph->GetHandle();
auto pool =
rGraph->AddOperator(rk::nn::OperatorType::POOL, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(pool2d,
kRKNPU,
paddle::lite::subgraph::rknpu::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[RKNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x_type = kernel->GetInputDeclType("X");
CHECK(x_type->layout() == DATALAYOUT(kNCHW));
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = x_dims.size();
auto out_name = op_info->Output("Out").front();
auto out_type = kernel->GetOutputDeclType("Out");
CHECK(out_type->layout() == DATALAYOUT(kNCHW));
auto output = scope->FindMutableTensor(out_name);
auto axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis += x_rank;
}
// for quantization
bool enable_int8 = false;
float input_scale = 1.0;
float output_scale = 1.0;
int bit_length = 8;
DataLayoutType layout = DATALAYOUT(kNCHW);
PrecisionType precision = PRECISION(kFloat);
if (op_info->HasAttr("enable_int8")) {
enable_int8 = op_info->GetAttr<bool>("enable_int8");
input_scale = op_info->GetAttr<float>("input_scale");
bit_length = op_info->GetAttr<int>("bit_length");
output_scale = op_info->GetAttr<float>("output_scale");
if (enable_int8) {
precision = PRECISION(kInt8);
}
}
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
QuantizationInfo qnt;
qnt.enable_int8 = enable_int8;
if (enable_int8) {
qnt.scale.push_back(input_scale);
qnt.quant_bits = bit_length;
}
x_node = graph->Add(x_name, *x, precision, layout, qnt);
}
std::shared_ptr<Node> output_node = nullptr;
QuantizationInfo output_qnt;
output_qnt.enable_int8 = enable_int8;
if (enable_int8) {
output_qnt.quant_bits = bit_length;
output_qnt.scale.push_back(output_scale);
output->mutable_data<int8_t>();
}
output_node = graph->Add(out_name, *output, precision, layout, output_qnt);
std::vector<std::shared_ptr<rk::nn::Tensor>> inputs;
std::vector<std::shared_ptr<rk::nn::Tensor>> outputs;
inputs.push_back(x_node->data());
outputs.push_back(output_node->data());
rk::nn::SoftmaxAttr attrs;
attrs.axis = axis;
attrs.beta = 1.0;
auto rGraph = graph->GetHandle();
auto softmax = rGraph->AddOperator(
rk::nn::OperatorType::SOFTMAX, inputs, outputs, &attrs);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(softmax,
kRKNPU,
paddle::lite::subgraph::rknpu::SoftmaxConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/bridges/utility.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision) {
rk::nn::PrecisionType t = rk::nn::PrecisionType::UNKNOWN;
switch (precision) {
case PrecisionType::kFloat:
t = rk::nn::PrecisionType::FLOAT32;
break;
case PrecisionType::kFP16:
t = rk::nn::PrecisionType::FLOAT16;
break;
case PrecisionType::kInt16:
t = rk::nn::PrecisionType::INT16;
break;
case PrecisionType::kInt32:
t = rk::nn::PrecisionType::INT32;
break;
case PrecisionType::kInt64:
t = rk::nn::PrecisionType::INT64;
break;
case PrecisionType::kInt8:
t = rk::nn::PrecisionType::INT8;
break;
case PrecisionType::kBool:
t = rk::nn::PrecisionType::BOOL8;
break;
default:
break;
}
return t;
}
rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout) {
rk::nn::DataLayoutType t = rk::nn::DataLayoutType::UNKNOWN;
switch (layout) {
case DataLayoutType::kNCHW:
t = rk::nn::DataLayoutType::NCHW;
break;
case DataLayoutType::kNHWC:
t = rk::nn::DataLayoutType::NHWC;
break;
default:
break;
}
return t;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace rknpu {
rk::nn::PrecisionType ToRknpuPrecisionType(PrecisionType precision);
rk::nn::DataLayoutType ToRknpuDataLayoutType(DataLayoutType layout);
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
} // namespace rknpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/rknpu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/rknpu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/rknpu/bridges/graph.h"
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#include "lite/kernels/rknpu/bridges/utility.h"
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace kernels {
namespace rknpu {
int SubgraphEngine::BuildDeviceProgram() {
LOG(INFO) << "[RKNPU]:BuildDeviceProgram";
int status = 0;
  // Convert all of the ops, along with their input vars and weights, and add
  // them into the RKNPU IR graph
subgraph::rknpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kRKNPU))) {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kRKNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Collect the valid input and output nodes in the RKNPU IR graph and update
// the input and output names
device_inames_.clear();
device_onames_.clear();
for (auto& input_name : input_names_) {
LOG(INFO) << "[RKNPU] Input node " << input_name;
if (graph.Has(input_name)) {
LOG(INFO) << input_name << " Precision "
<< PrecisionToStr(graph.Get(input_name)->precision());
device_itensors_.push_back(graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[RKNPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
for (auto& output_name : output_names_) {
LOG(INFO) << "[RKNPU] Output node " << output_name;
if (graph.Has(output_name)) {
auto tensor = scope_->FindMutableTensor(output_name);
LOG(INFO) << output_name << " Precision "
<< PrecisionToStr(tensor->precision());
device_otensors_.push_back(graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[RKNPU] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[RKNPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[RKNPU] No output nodes found for building NPU model";
device_program_ = lite::rknpu::Device::Global().Build(
model_name_, graph.GetHandle(), device_itensors_, device_otensors_);
if (device_program_ == nullptr) {
LOG(WARNING) << "[RKNPU] Build model failed!";
return subgraph::FAILED;
}
// input
origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size());
for (size_t i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
}
// output
origin_odims_.resize(output_names_.size());
origin_otensors_.resize(output_names_.size());
for (size_t i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
auto output_dims = origin_otensors_[i]->dims();
}
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
device_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
}
for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout);
// Prepare the device output tensors
switch (precision) {
case PRECISION(kFloat):
origin_otensors_[i]->mutable_data<float>();
break;
case PRECISION(kInt8):
origin_otensors_[i]->mutable_data<int8_t>();
break;
case PRECISION(kInt16):
origin_otensors_[i]->mutable_data<int16_t>();
break;
case PRECISION(kInt32):
origin_otensors_[i]->mutable_data<int32_t>();
break;
case PRECISION(kInt64):
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[RKNPU] " << device_onames_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
LOG(INFO) << "[RKNPU]:LaunchDeviceProgram";
std::vector<rk::nn::InputInfo> inputs;
std::vector<rk::nn::OutputInfo> outputs;
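  // Wrap the origin (CPU) tensors as RKNPU input/output descriptors; the DDK
  // reads input data from and writes results into these buffers directly.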
inputs.resize(device_itensors_.size());
for (size_t i = 0; i < device_itensors_.size(); i++) {
inputs[i].index = i;
inputs[i].buf = const_cast<void*>(origin_itensors_[i]->raw_data());
inputs[i].size = origin_itensors_[i]->memory_size();
inputs[i].pass_through = false;
inputs[i].type =
subgraph::rknpu::ToRknpuPrecisionType(origin_itensors_[i]->precision());
inputs[i].layout = rk::nn::DataLayoutType::NCHW;
}
outputs.resize(device_otensors_.size());
for (size_t i = 0; i < device_otensors_.size(); i++) {
outputs[i].index = i;
outputs[i].buf = const_cast<void*>(origin_otensors_[i]->raw_data());
outputs[i].size = origin_otensors_[i]->memory_size();
outputs[i].want_float = false;
}
device_program_->SetInputs(inputs);
device_program_->Run();
device_program_->GetOutputs(outputs);
return 0;
}
void SubgraphCompute::PrepareForRun() {
LOG(INFO) << "[RKNPU]:PrepareForRun";
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
LOG(INFO) << "[RKNPU]:Run";
CHECK(engine_);
engine_->Launch();
}
} // namespace rknpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kRKNPU,
kInt8,
kNCHW,
paddle::lite::kernels::rknpu::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/program.h"
#include "lite/core/types.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "rknpu/rknpu_pub.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace rknpu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
std::string model_name_;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_itensors_;
std::vector<std::shared_ptr<rk::nn::Tensor>> device_otensors_;
std::unique_ptr<rk::nn::Exection> device_program_{nullptr};
};
class SubgraphCompute
: public KernelLite<TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace rknpu
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -12,3 +12,10 @@ if(LITE_WITH_XPU)
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
if(LITE_WITH_RKNPU)
lite_cc_test(test_mobilenetv1_int8_rknpu SRCS test_mobilenetv1_int8_rknpu.cc
DEPS ${lite_model_test_DEPS} paddle_api_full
RKNPU_DEPS ${rknpu_kernels} ${rknpu_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/MobilenetV1_full_quant SERIAL)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/time.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
inline int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "[ERROR] usage: ./" << argv[0]
<< " model_dir [thread_num] [warmup_times] [repeat_times] "
"[input_data_path] [output_data_path]"
<< std::endl;
return -1;
}
std::string model_dir = argv[1];
int thread_num = 1;
if (argc > 2) {
thread_num = atoi(argv[2]);
}
int warmup_times = 5;
if (argc > 3) {
warmup_times = atoi(argv[3]);
}
int repeat_times = 10;
if (argc > 4) {
repeat_times = atoi(argv[4]);
}
std::string input_data_path;
if (argc > 5) {
input_data_path = argv[5];
}
std::string output_data_path;
if (argc > 6) {
output_data_path = argv[6];
}
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);
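  // List both ARM and RKNPU places so that quantized ops with an RKNPU bridge
  // run on the NPU subgraph while unsupported ops fall back to ARM kernels.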
  config.set_valid_places(
      {paddle::lite_api::Place{
           TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)},
       paddle::lite_api::Place{
           TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)},
       paddle::lite_api::Place{
           TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}});
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::unique_ptr<paddle::lite_api::Tensor> input_tensor(
std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, 224, 224});
auto input_data = input_tensor->mutable_data<float>();
auto input_size = ShapeProduction(input_tensor->shape());
if (input_data_path.empty()) {
for (int i = 0; i < input_size; i++) {
input_data[i] = 1;
}
} else {
std::fstream fs(input_data_path, std::ios::in);
if (!fs.is_open()) {
std::cerr << "open input data file failed." << std::endl;
return -1;
}
for (int i = 0; i < input_size; i++) {
fs >> input_data[i];
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < repeat_times; ++i) {
predictor->Run();
}
std::cout << "Model: " << model_dir << ", threads num " << thread_num
<< ", warmup times: " << warmup_times
<< ", repeat times: " << repeat_times << ", spend "
<< (GetCurrentUS() - start) / repeat_times / 1000.0
<< " ms in average." << std::endl;
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
auto output_data = output_tensor->data<float>();
auto output_size = ShapeProduction(output_tensor->shape());
std::cout << "output data:";
for (int i = 0; i < output_size; i += 100) {
std::cout << "[" << i << "] " << output_data[i] << std::endl;
}
return 0;
}
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
......@@ -27,6 +27,8 @@ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.hua
BUILD_XPU=OFF
BUILD_XTCL=OFF
XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
BUILD_RKNPU=OFF
RKNPU_DDK_ROOT="$(pwd)/rknpu/"
LITE_WITH_ARM_LANG=OFF
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
......@@ -141,6 +143,8 @@ function make_tiny_publish_so {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j$NUM_PROC
......@@ -230,6 +234,8 @@ function make_full_publish_so {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DLITE_WITH_TRAIN=$BUILD_TRAIN \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
......@@ -265,6 +271,8 @@ function make_all_tests {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_RKNPU=$BUILD_RKNPU \
-DRKNPU_DDK_ROOT=$RKNPU_DDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make lite_compile_deps -j$NUM_PROC
......@@ -498,6 +506,14 @@ function main {
XPU_SDK_ROOT="${i#*=}"
shift
;;
--build_rknpu=*)
BUILD_RKNPU="${i#*=}"
shift
;;
--rknpu_ddk_root=*)
RKNPU_DDK_ROOT="${i#*=}"
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
......
#!/bin/bash
set -ex
# global variables with default value
ARM_OS="armlinux" # android only yet
ARM_ABI="armv8" # armv8, armv7
ARM_LANG="gcc" # gcc only yet
DDK_ROOT="$(pwd)/rknpu"
TARGET_NAME="test_subgraph_pass" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
SHUTDOWN_LOG=OFF # ON(disable logging)/OFF
ON_TINY_PUBLISH=OFF # ON(tiny publish)/OFF(full publish)
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--arm_os=<os> android only yet."
echo -e "--arm_abi=<abi> armv8, armv7 yet."
echo -e "--arm_lang=<gcc>"
echo -e "--ddk_root=<hiai_ddk_root>"
echo -e "--target_name=<target_name>"
echo "----------------------------------------"
echo
}
# For code gen, a source file is generated after a test,
# but it is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2.Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
}
function prepare_thirdparty {
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$PWD
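    # Re-create third-party from the pre-packed tarball when the directory is
    # missing (or a tarball is already present); otherwise update submodules.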
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
function build_npu {
cur_dir=$(pwd)
prepare_thirdparty
local publish_dir
if [[ "${ON_TINY_PUBLISH}" == "ON" ]]; then
WITH_TESTING=OFF
SHUTDOWN_LOG=ON
publish_dir="tiny_publish"
else
publish_dir="full_publish"
fi
build_dir=$cur_dir/build.lite.rknpu.${ARM_OS}.${ARM_ABI}.${ARM_LANG}.${publish_dir}
mkdir -p $build_dir
cd $build_dir
    # Prepare the fake gen_code file and debug tool required by cmake
prepare_workspace
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_NPU=OFF \
-DLITE_WITH_JAVA=OFF \
-DLITE_WITH_ARM=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_ARM_DOTPROD=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DLITE_SHUTDOWN_LOG=${SHUTDOWN_LOG} \
-DLITE_ON_TINY_PUBLISH=${ON_TINY_PUBLISH} \
-DARM_TARGET_OS=${ARM_OS} \
-DARM_TARGET_ARCH_ABI=${ARM_ABI} \
-DARM_TARGET_LANG=${ARM_LANG} \
-DLITE_WITH_RKNPU=ON \
-DRKNPU_DDK_ROOT=${DDK_ROOT}
make $TARGET_NAME -j2
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--arm_os=*)
ARM_OS="${i#*=}"
shift
;;
--arm_abi=*)
ARM_ABI="${i#*=}"
shift
;;
--arm_lang=*)
ARM_LANG="${i#*=}"
shift
;;
--android_stl=*)
ANDROID_STL="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--ddk_root=*)
DDK_ROOT="${i#*=}"
shift
;;
build)
build_npu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_npu
shift
;;
tiny_publish)
ON_TINY_PUBLISH=ON
TARGET_NAME=publish_inference
build_npu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
......@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"]
valid_ops = [[], [], [], [], [], [], [], [], [], []]
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[]]
class TargetType:
kUnk = 0
kHost = 1
......@@ -68,6 +68,9 @@ class TargetType:
kFPGA = 7
kNPU = 8
kXPU = 9
kBM = 10
kMLU = 11
kRKNPU = 12
kAny = 6 # any target
# record op_info of valid kernels into `valid_ops` according to different target type
......