Unverified commit 32065859, authored by tensor-tang, committed by GitHub

[NPU] add script and refine tests (#1873)

* add npu script and tester

* fix npu armv7 so and refine tests

test=develop

* update fix and refine log

test=develop
Parent c52e6647
@@ -32,14 +32,26 @@ endif()
include_directories("${NPU_DDK_ROOT}")

set(NPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
    set(NPU_SUB_LIB_PATH "lib64")
endif()

if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
    set(NPU_SUB_LIB_PATH "lib")
endif()

find_library(NPU_DDK_HIAI_FILE NAMES hiai
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})

find_library(NPU_DDK_IR_FILE NAMES hiai_ir
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})

find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})

find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite
             PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})

if(NOT NPU_DDK_HIAI_FILE)
    message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")

@@ -65,6 +77,14 @@ else()
    set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()

if(NOT NPU_DDK_PROTO_FILE)
    message(FATAL_ERROR "Can not find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}")
else()
    message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}")
    add_library(npu_ddk_proto SHARED IMPORTED GLOBAL)
    set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE})
endif()

set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs")
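The net effect of the CMake change is that the HiAI DDK libraries (hiai, hiai_ir, hiai_ir_build, and now protobuf-lite) are searched under lib64 for armv8 and under lib for armv7, instead of a hard-coded lib64. A minimal sketch of a configure command exercising the armv7 path follows; the DDK location and directory layout shown are assumptions for illustration, not part of this commit:

# Hypothetical DDK layout (illustrative only):
#   ai_ddk_lib/lib64/  libhiai.so  libhiai_ir.so  libhiai_ir_build.so  libprotobuf-lite.so   (armv8)
#   ai_ddk_lib/lib/    libhiai.so  libhiai_ir.so  libhiai_ir_build.so  libprotobuf-lite.so   (armv7)
# With NPU_SUB_LIB_PATH, an armv7 configure now resolves the libraries from "lib":
cmake .. \
  -DLITE_WITH_NPU=ON \
  -DARM_TARGET_ARCH_ABI=armv7 \
  -DNPU_DDK_ROOT=/path/to/ai_ddk_lib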
@@ -17,6 +17,7 @@
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/subgraph/subgraph_program_pass.h"
#include "lite/core/op_registry.h"
#include "lite/core/program.h"
#include "lite/core/tensor.h"

@@ -24,41 +25,92 @@
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/model_parser/pb/program_desc.h"

DEFINE_string(optimized_model, "", "optimized_model");
DEFINE_int32(batch_size, 1, "batch size");
DEFINE_int32(im_channel, 3, "im_channel");

namespace paddle {
namespace lite {

void TestModel(lite::Predictor* predictor,
               const std::vector<Place>& valid_places,
               const std::string& model_dir) {
  predictor->Build(
      model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, valid_places);

  auto* input_tensor = predictor->GetInput(0);
  input_tensor->Resize(DDim(std::vector<DDim::value_type>(
      {FLAGS_batch_size, FLAGS_im_channel, FLAGS_im_height, FLAGS_im_width})));
  auto* data = input_tensor->mutable_data<float>();
  auto item_size = input_tensor->dims().production();
  for (int i = 0; i < item_size; i++) {
    data[i] = 1;
  }

  if (std::find(valid_places.begin(),
                valid_places.end(),
                Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) {
    // TODO(TJ): change if valid npu so try use it, add rollback and move to api
    predictor->GenNPURuntimeProgram();
  }

  predictor->Run();

  if (model_dir != FLAGS_optimized_model &&
      std::find(valid_places.begin(),
                valid_places.end(),
                Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) {
    predictor->SaveModel(FLAGS_optimized_model);
  }
}

void CompareOutData(const lite::Predictor& tgt, const lite::Predictor& ref) {
  auto* tgt_otensor = tgt.GetOutput(0);
  auto* ref_otensor = ref.GetOutput(0);
  const auto* tgt_pdata = tgt_otensor->data<float>();
  const auto* ref_pdata = ref_otensor->data<float>();
  EXPECT_EQ(tgt_otensor->dims().production(), ref_otensor->dims().production());
  for (size_t i = 0; i < tgt_otensor->dims().production(); ++i) {
    auto diff = std::fabs((tgt_pdata[i] - ref_pdata[i]) / ref_pdata[i]);
    VLOG(3) << diff;
    EXPECT_LT(diff, 0.1);
  }
}

TEST(NPUSubgraph, compare) {
  DeviceInfo::Init();
  DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, 1);

  lite::Predictor predictor_arm, predictor_npu, predictor_npu_savedmodel;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kARM), PRECISION(kFloat)}});

  TestModel(&predictor_arm, valid_places, FLAGS_model_dir);

  valid_places.push_back(Place{TARGET(kNPU), PRECISION(kFloat)});
  TestModel(&predictor_npu, valid_places, FLAGS_model_dir);

  CompareOutData(predictor_npu, predictor_arm);

  LOG(INFO) << " ================ NPU speed ================== ";
  for (int i = 0; i < FLAGS_repeats; ++i) {
    auto start = GetCurrentUS();
    predictor_npu.Run();
    LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
  }

  LOG(INFO) << " =================== ARM CPU speed =================== ";
  for (int i = 0; i < FLAGS_repeats; ++i) {
    auto start = GetCurrentUS();
    predictor_arm.Run();
    LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
  }

  TestModel(&predictor_npu_savedmodel, valid_places, FLAGS_optimized_model);
  CompareOutData(predictor_npu_savedmodel, predictor_arm);
}

}  // namespace lite
......
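To run the refined comparison test on a device, an invocation along the following lines could be used. This is a hedged sketch: the binary name matches the build script's default test name (test_npu_pass), while the device paths, model directory, and flag values are assumptions for illustration; --model_dir, --im_height, --im_width, and --repeats are presumably defined in lite/api/test_helper.h, which the test now includes.

# Illustrative only: push the test binary and a model to an Android device and run it.
adb push <build_dir>/test_npu_pass /data/local/tmp/
adb shell "cd /data/local/tmp && ./test_npu_pass \
    --model_dir=/data/local/tmp/mobilenet_v1 \
    --optimized_model=/data/local/tmp/mobilenet_v1_opt \
    --batch_size=1 --im_channel=3 --im_height=224 --im_width=224 \
    --repeats=10"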
#!/bin/bash
set -ex
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--arm_os=<os> android only yet."
echo -e "--arm_abi=<abi> armv8, armv7 yet."
echo -e "--arm_stl=<shared> shared or static"
echo -e "--arm_lang=<gcc> "
echo -e "--ddk_root=<hiai_ddk_root> "
echo -e "--test_name=<test_name>"
echo "----------------------------------------"
echo
}
# For code gen, a source file is generated after a test,
# but it is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2.Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
}
function prepare_thirdparty {
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$PWD
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
function cmake_npu {
prepare_workspace
# $1: ARM_TARGET_OS in "android" , "armlinux"
# $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf"
# $3: ARM_TARGET_LANG in "gcc" "clang"
# $4: ANDROID_STL_TYPE in "c++_shared" "c++_static"
# $5: DDK_ROOT path
# NPU libs need API level 24 or above
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DWITH_ARM_DOTPROD=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DLITE_WITH_NPU=ON \
-DANDROID_API_LEVEL=24 \
-DARM_TARGET_OS=$1 \
-DARM_TARGET_ARCH_ABI=$2 \
-DARM_TARGET_LANG=$3 \
-DANDROID_STL_TYPE=$4 \
-DNPU_DDK_ROOT=$5
}
function build_npu {
# os, abi, lang, stl, ddk_root, test_name
cur_dir=$(pwd)
local os=android
local abi=armv8
local lang=gcc
local stl="c++_shared"
local ddk_root="${cur_dir}/ai_ddk_lib/"
local test_name=test_npu_pass
prepare_thirdparty
if [[ $# -ge 1 ]]; then
os=$1
fi
if [[ $# -ge 2 ]]; then
abi=$2
fi
if [[ $# -ge 3 ]]; then
lang=$3
fi
if [[ $# -ge 4 ]]; then
stl=$4
fi
if [[ $# -ge 5 ]]; then
ddk_root=$5
fi
if [[ $# -ge 6 ]]; then
test_name=$6
fi
build_dir=$cur_dir/build.lite.npu.${os}.${abi}.${lang}.${stl}
mkdir -p $build_dir
cd $build_dir
cmake_npu ${os} ${abi} ${lang} ${stl} ${ddk_root}
make $test_name -j8
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--tests=*)
TESTS_FILE="${i#*=}"
shift
;;
--test_name=*)
TEST_NAME="${i#*=}"
shift
;;
--arm_os=*)
ARM_OS="${i#*=}"
shift
;;
--arm_abi=*)
ARM_ABI="${i#*=}"
shift
;;
--arm_lang=*)
ARM_LANG="${i#*=}"
shift
;;
--arm_stl=*)
ARM_STL="${i#*=}"
shift
;;
--ddk_root=*)
DDK_ROOT="${i#*=}"
shift
;;
build)
# Pass the parsed options through; fall back to build_npu's defaults for any option not given.
build_npu ${ARM_OS:-android} ${ARM_ABI:-armv8} ${ARM_LANG:-gcc} ${ARM_STL:-c++_shared} ${DDK_ROOT:-$(pwd)/ai_ddk_lib/} ${TEST_NAME:-test_npu_pass}
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
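For reference, one way this script might be invoked; the script path below is only a placeholder, since this view does not show where the new file lives in the repository.

# Illustrative only: build the NPU comparison test for an armv7 Android target.
bash ./build_npu.sh \
    --arm_os=android \
    --arm_abi=armv7 \
    --arm_lang=gcc \
    --arm_stl=c++_shared \
    --ddk_root=./ai_ddk_lib \
    --test_name=test_npu_pass \
    build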