diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake
index ec5a318896744ab2a66788d0b12d67c35865b8ec..863200986c93ea09d3fa3049fe684b32c2fb52dd 100644
--- a/cmake/cross_compiling/npu.cmake
+++ b/cmake/cross_compiling/npu.cmake
@@ -32,14 +32,26 @@ endif()
 
 include_directories("${NPU_DDK_ROOT}")
 
+set(NPU_SUB_LIB_PATH "lib64")
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+  set(NPU_SUB_LIB_PATH "lib64")
+endif()
+
+if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+  set(NPU_SUB_LIB_PATH "lib")
+endif()
+
 find_library(NPU_DDK_HIAI_FILE NAMES hiai
-  PATHS ${NPU_DDK_ROOT}/lib64)
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
 
 find_library(NPU_DDK_IR_FILE NAMES hiai_ir
-  PATHS ${NPU_DDK_ROOT}/lib64)
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
 
 find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
-  PATHS ${NPU_DDK_ROOT}/lib64)
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
+
+find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite
+  PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
 
 if(NOT NPU_DDK_HIAI_FILE)
   message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
@@ -65,6 +77,14 @@ else()
   set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
 endif()
 
-set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk libs")
+if(NOT NPU_DDK_PROTO_FILE)
+  message(FATAL_ERROR "Can not find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}")
+else()
+  message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}")
+  add_library(npu_ddk_proto SHARED IMPORTED GLOBAL)
+  set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE})
+endif()
+
+set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs")
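
With protobuf-lite added, the cached npu_ddk_libs list now names four IMPORTED targets. For orientation, a minimal CMake consumer sketch — the target and source names below are hypothetical, not part of this patch:

# Hypothetical downstream target; "npu_bridge_demo" and its source file
# are illustrative names only.
add_library(npu_bridge_demo SHARED npu_bridge_demo.cc)
# Each entry of npu_ddk_libs is an IMPORTED target whose IMPORTED_LOCATION
# was resolved by the find_library() calls above, so linking the cached
# list pulls in hiai, hiai_ir, hiai_ir_build, and protobuf-lite.
target_link_libraries(npu_bridge_demo ${npu_ddk_libs})
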
diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
index d7ce9ed7d6791b2d4165b8c3f34173700af4c036..02c42a507ffd98bc4a2a1ed643a6339ae5e72d46 100644
--- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
+++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
+#include <algorithm>  // std::find
+#include <cmath>      // std::fabs
 #include "lite/core/mir/graph_visualize_pass.h"
 #include "lite/core/mir/subgraph/subgraph_program_pass.h"
+#include "lite/core/op_registry.h"
 #include "lite/core/program.h"
 #include "lite/core/tensor.h"
@@ -24,41 +25,92 @@
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
-#include "lite/core/op_registry.h"
+#include "lite/api/test_helper.h"
 #include "lite/model_parser/pb/program_desc.h"
 
-DEFINE_string(model_dir, "", "model_dir");
 DEFINE_string(optimized_model, "", "optimized_model");
+DEFINE_int32(batch_size, 1, "batch size");
+DEFINE_int32(im_channel, 3, "im_channel");
 
 namespace paddle {
 namespace lite {
 
-TEST(NPUSubgraph, mobilenetv1) {
-  lite::Predictor predictor;
-  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
-                                   Place{TARGET(kARM), PRECISION(kFloat)},
-                                   Place{TARGET(kNPU), PRECISION(kFloat)}});
-  predictor.Build(
-      FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, valid_places);
-
-  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
-  // input_tensor->Resize(DDim(std::vector<int64_t>({1, 13, 1, 1})));
+void TestModel(lite::Predictor* predictor,
+               const std::vector<Place>& valid_places,
+               const std::string& model_dir) {
+  predictor->Build(
+      model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, valid_places);
+
+  auto* input_tensor = predictor->GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>(
+      {FLAGS_batch_size, FLAGS_im_channel, FLAGS_im_height, FLAGS_im_width})));
   auto* data = input_tensor->mutable_data<float>();
   auto item_size = input_tensor->dims().production();
   for (int i = 0; i < item_size; i++) {
     data[i] = 1;
   }
 
-  predictor.GenNPURuntimeProgram();
+  if (std::find(valid_places.begin(),
+                valid_places.end(),
+                Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) {
+    // TODO(TJ): if NPU is available, try to use it; add rollback and
+    // move this into the API.
+    predictor->GenNPURuntimeProgram();
+  }
+
+  predictor->Run();
+  if (model_dir != FLAGS_optimized_model &&
+      std::find(valid_places.begin(),
+                valid_places.end(),
+                Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) {
+    predictor->SaveModel(FLAGS_optimized_model);
+  }
+}
 
-  for (int i = 0; i < 10; ++i) {
-    predictor.Run();
+void CompareOutData(const lite::Predictor& tgt, const lite::Predictor& ref) {
+  auto* tgt_otensor = tgt.GetOutput(0);
+  auto* ref_otensor = ref.GetOutput(0);
+  const auto* tgt_pdata = tgt_otensor->data<float>();
+  const auto* ref_pdata = ref_otensor->data<float>();
+  EXPECT_EQ(tgt_otensor->dims().production(), ref_otensor->dims().production());
+  for (size_t i = 0; i < tgt_otensor->dims().production(); ++i) {
+    auto diff = std::fabs((tgt_pdata[i] - ref_pdata[i]) / ref_pdata[i]);
+    VLOG(3) << diff;
+    EXPECT_LT(diff, 0.1);
   }
+}
+
+TEST(NPUSubgraph, compare) {
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, 1);
+
+  lite::Predictor predictor_arm, predictor_npu, predictor_npu_savedmodel;
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
+                                   Place{TARGET(kARM), PRECISION(kFloat)}});
+
+  TestModel(&predictor_arm, valid_places, FLAGS_model_dir);
+
+  valid_places.push_back(Place{TARGET(kNPU), PRECISION(kFloat)});
+  TestModel(&predictor_npu, valid_places, FLAGS_model_dir);
+
+  CompareOutData(predictor_npu, predictor_arm);
+  LOG(INFO) << " ================ NPU speed ================== ";
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    auto start = GetCurrentUS();
+    predictor_npu.Run();
+    LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
+  }
+
+  LOG(INFO) << " =================== ARM CPU speed =================== ";
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    auto start = GetCurrentUS();
+    predictor_arm.Run();
+    LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
+  }
+
+  TestModel(&predictor_npu_savedmodel, valid_places, FLAGS_optimized_model);
 
-  LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
-  predictor.SaveModel(FLAGS_optimized_model);
+  CompareOutData(predictor_npu_savedmodel, predictor_arm);
 }
 
 }  // namespace lite
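
The rewritten test is driven entirely by gflags; FLAGS_model_dir, FLAGS_repeats, FLAGS_im_height, and FLAGS_im_width are expected to come from the newly included lite/api/test_helper.h. An on-device run might therefore look like the sketch below; the binary location, device paths, and model name are illustrative assumptions, not fixed by this patch:

# Hypothetical adb invocation; adjust the binary and model paths to your build.
adb push test_npu_pass /data/local/tmp/
adb push mobilenet_v1 /data/local/tmp/mobilenet_v1
adb shell "cd /data/local/tmp && ./test_npu_pass \
    --model_dir=./mobilenet_v1 \
    --optimized_model=./mobilenet_v1_npu \
    --repeats=10"
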
diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..600569fdd74c62d92960227dedba4d46321a831e
--- /dev/null
+++ b/lite/tools/build_npu.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+set -ex
+
+function print_usage {
+    echo -e "\nUSAGE:"
+    echo
+    echo "----------------------------------------"
+    echo -e "--arm_os=<os>         android only, for now."
+    echo -e "--arm_abi=<abi>       armv8 or armv7."
+    echo -e "--arm_stl=<stl>       c++_shared or c++_static."
+    echo -e "--arm_lang=<lang>     gcc or clang."
+    echo -e "--ddk_root=<path>     path to the HiAI DDK."
+    echo -e "--test_name=<name>    test target to build."
+    echo "----------------------------------------"
+    echo
+}
+
+# For code gen, a source file is generated after a test runs,
+# but some cmake targets depend on it.
+# Here we fake an empty file so that cmake can work.
+function prepare_workspace {
+    # in the build directory
+    # 1. Prepare the gen_code file
+    GEN_CODE_PATH_PREFIX=lite/gen_code
+    mkdir -p ./${GEN_CODE_PATH_PREFIX}
+    touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
+
+    # 2. Prepare the debug tool
+    DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
+    mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
+    cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
+}
+
+function prepare_thirdparty {
+    readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
+
+    readonly workspace=$PWD
+    if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
+        rm -rf $workspace/third-party
+
+        if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
+            wget $THIRDPARTY_TAR
+        fi
+        tar xzf third-party-05b862.tar.gz
+    else
+        git submodule update --init --recursive
+    fi
+}
+
+function cmake_npu {
+    prepare_workspace
+    # $1: ARM_TARGET_OS in "android", "armlinux"
+    # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7", "armv7hf"
+    # $3: ARM_TARGET_LANG in "gcc", "clang"
+    # $4: ANDROID_STL_TYPE in "c++_shared", "c++_static"
+    # $5: DDK_ROOT path
+
+    # NPU libs require Android API level 24 or above
+    cmake .. \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_LITE=ON \
+        -DLITE_WITH_CUDA=OFF \
+        -DLITE_WITH_X86=OFF \
+        -DLITE_WITH_ARM=ON \
+        -DWITH_ARM_DOTPROD=ON \
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+        -DWITH_TESTING=ON \
+        -DLITE_WITH_NPU=ON \
+        -DANDROID_API_LEVEL=24 \
+        -DARM_TARGET_OS=$1 \
+        -DARM_TARGET_ARCH_ABI=$2 \
+        -DARM_TARGET_LANG=$3 \
+        -DANDROID_STL_TYPE=$4 \
+        -DNPU_DDK_ROOT=$5
+}
+
+function build_npu {
+    # os, abi, lang, stl, ddk_root, test_name
+    cur_dir=$(pwd)
+
+    local os=android
+    local abi=armv8
+    local lang=gcc
+    local stl="c++_shared"
+    local ddk_root="${cur_dir}/ai_ddk_lib/"
+    local test_name=test_npu_pass
+
+    prepare_thirdparty
+
+    if [[ $# -ge 1 ]]; then
+        os=$1
+    fi
+    if [[ $# -ge 2 ]]; then
+        abi=$2
+    fi
+    if [[ $# -ge 3 ]]; then
+        lang=$3
+    fi
+    if [[ $# -ge 4 ]]; then
+        stl=$4
+    fi
+    if [[ $# -ge 5 ]]; then
+        ddk_root=$5
+    fi
+    if [[ $# -ge 6 ]]; then
+        test_name=$6
+    fi
+
+    build_dir=$cur_dir/build.lite.npu.${os}.${abi}.${lang}.${stl}
+    mkdir -p $build_dir
+    cd $build_dir
+
+    cmake_npu ${os} ${abi} ${lang} ${stl} ${ddk_root}
+    make $test_name -j8
+
+    cd -
+    echo "Done"
+}
+
+function main {
+    # Defaults; each can be overridden by the flags parsed below.
+    local ARM_OS=android
+    local ARM_ABI=armv8
+    local ARM_LANG=gcc
+    local ARM_STL="c++_shared"
+    local DDK_ROOT="$(pwd)/ai_ddk_lib/"
+    local TEST_NAME=test_npu_pass
+
+    # Parse the command line.
+    for i in "$@"; do
+        case $i in
+            --tests=*)
+                TESTS_FILE="${i#*=}"
+                shift
+                ;;
+            --test_name=*)
+                TEST_NAME="${i#*=}"
+                shift
+                ;;
+            --arm_os=*)
+                ARM_OS="${i#*=}"
+                shift
+                ;;
+            --arm_abi=*)
+                ARM_ABI="${i#*=}"
+                shift
+                ;;
+            --arm_lang=*)
+                ARM_LANG="${i#*=}"
+                shift
+                ;;
+            --arm_stl=*)
+                ARM_STL="${i#*=}"
+                shift
+                ;;
+            --ddk_root=*)
+                DDK_ROOT="${i#*=}"
+                shift
+                ;;
+            build)
+                build_npu $ARM_OS $ARM_ABI $ARM_LANG $ARM_STL $DDK_ROOT $TEST_NAME
+                shift
+                ;;
+            *)
+                # unknown option
+                print_usage
+                exit 1
+                ;;
+        esac
+    done
+}
+
+main "$@"
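
A plausible invocation of the new script, as a sketch: it assumes the HiAI DDK has already been unpacked into ./ai_ddk_lib, which is also the script's default ddk_root. Note that the bare build argument must come after the flags, since main() dispatches the build as soon as it reaches it.

# Hypothetical invocation from the repo root; every flag is optional and
# falls back to the defaults set in main().
./lite/tools/build_npu.sh \
    --arm_os=android \
    --arm_abi=armv8 \
    --arm_lang=gcc \
    --arm_stl=c++_shared \
    --ddk_root=./ai_ddk_lib \
    --test_name=test_npu_pass \
    build
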