Commit 6946ca23 authored by HappyAngel, committed by Yan Chunwei

[lite][arm]fix model_optimize bug, update concat and split op, speed up (#2620)

Parent d4739621
...@@ -73,7 +73,7 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
 lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
 lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
- lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
+ lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
......
...@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(deps ${lite_deps_DEPS})
...@@ -44,7 +44,7 @@ function (lite_deps TARGET)
       set(deps ${deps} ${var})
     endforeach(var)
     if(LITE_WITH_CV)
-      foreach(var ${lite_cv_deps})
+      foreach(var ${lite_deps_CV_DEPS})
         set(deps ${deps} ${var})
       endforeach(var)
     endif()
...@@ -115,10 +115,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 # LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None
+ # CV_DEPS: LITE_WITH_CV
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -129,6 +130,7 @@ function(lite_cc_library TARGET)
       CUDA_DEPS ${args_CUDA_DEPS}
      CL_DEPS ${args_CL_DEPS}
      ARM_DEPS ${args_ARM_DEPS}
+     CV_DEPS ${args_CV_DEPS}
      FPGA_DEPS ${args_FPGA_DEPS}
      NPU_DEPS ${args_NPU_DEPS}
      XPU_DEPS ${args_XPU_DEPS}
...@@ -162,7 +164,7 @@ function(lite_cc_binary TARGET)
   endif()
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
+    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(deps "")
...@@ -178,6 +180,7 @@ function(lite_cc_binary TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
+     CV_DEPS ${CV_DEPS}
      )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
   target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
...@@ -208,7 +211,7 @@ function(lite_cc_test TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
+    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
     ARGS
     COMPILE_LEVEL # (basic|extra)
     )
...@@ -232,6 +235,7 @@ function(lite_cc_test TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
+     CV_DEPS ${args_CV_DEPS}
      )
   _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
   # strip binary target to reduce size
......
...@@ -222,7 +222,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
     COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
     COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-    COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
+    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+    COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
     )
   add_dependencies(publish_inference_android_cxx_demos logging gflags)
   add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
...@@ -236,7 +237,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
     COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
     COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
     COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
+    COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+    COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
     )
   add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
 endif()
......
...@@ -24,12 +24,16 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
 endif()
 if(LITE_WITH_CUDA)
   target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
 endif(LITE_WITH_CUDA)
 #light api dynamic library
 lite_cc_library(paddle_light_api_shared MODULE
   SRCS light_api_shared.cc
   DEPS ${light_lib_DEPS}
-  ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
+  ARM_DEPS ${arm_kernels}
+  CV_DEPS paddle_cv_arm
+  NPU_DEPS ${npu_kernels})
 target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
 if (LITE_WITH_NPU)
   # Strips the symbols of our protobuf functions to fix the conflicts during
...@@ -75,16 +79,17 @@ message(STATUS "get FPGA kernels ${fpga_kernels}")
 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
     set(cxx_api_deps
         scope optimizer target_wrapper_host model_parser program)
     lite_cc_library(cxx_api
         SRCS cxx_api.cc
         DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
         X86_DEPS ${x86_kernels}
         ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
+        XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels})
 endif()
 # for light api
...@@ -100,6 +105,7 @@ lite_cc_library(light_api SRCS light_api.cc
         CUDA_DEPS ${cuda_kernels}
        X86_DEPS ${x86_kernels}
        ARM_DEPS ${arm_kernels}
+       CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        XPU_DEPS ${xpu_kernels}
        CL_DEPS ${opencl_kernels}
...@@ -224,11 +230,12 @@ else()
 endif()
 if (NOT LITE_ON_TINY_PUBLISH)
     lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light
         ${ops}
         ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels})
     # The final inference library for just MobileConfig.
     bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
...@@ -258,7 +265,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
     add_subdirectory(android)
 endif()
 if (LITE_WITH_PYTHON)
     add_subdirectory(python)
 endif()
...@@ -288,25 +295,16 @@ endif()
 # Some bins
 if(NOT IOS)
     lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
         ${ops} ${host_kernels}
         ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
-    lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
-        ${ops} ${host_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        XPU_DEPS ${xpu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        X86_DEPS ${x86_kernels}
+        CUDA_DEPS ${cuda_kernels})
 endif()
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
...@@ -26,31 +26,32 @@ namespace math {
 void concat_func(const std::vector<lite::Tensor *> &input,
                  const int axis,
                  lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
+  int64_t concat_input_size = 1;
+  int64_t num_cancats = 1;
   auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
+  size_t num = input.size();
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
   }
-  int out_rows = rows, out_cols = 0;
-  std::vector<int64_t> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
+  for (int i = 0; i < axis; i++) {
+    num_cancats *= dim_0[i];
   }
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
+  float *dst_ptr = output->mutable_data<float>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n]->dims();
+    const float *src_ptr = input[n]->data<float>();
+    int64_t in_concat_axis = dims[axis];
+    float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_cancats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
     }
+    offset_concat_axis += in_concat_axis;
   }
 }
......
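Read on its own, the rewritten `concat_func` precomputes the product of the dimensions after the concat axis (`concat_input_size`) and before it (`num_cancats`), then copies each input in contiguous slabs with one `memcpy` per outer row, instead of one `memcpy` per row-column pair. A minimal standalone sketch of the same copy scheme on raw float buffers — `concat_ref` and its signature are illustrative, not part of the Lite API:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Concatenate `inputs` along `axis`; all inputs share `dims[n]` except along
// the axis itself. Mirrors the slab-copy scheme of the new concat_func.
void concat_ref(const std::vector<const float*>& inputs,
                const std::vector<std::vector<int64_t>>& dims,
                int axis,
                float* out) {
  int64_t inner = 1;  // product of dims after `axis` (concat_input_size)
  for (size_t i = axis + 1; i < dims[0].size(); i++) inner *= dims[0][i];
  int64_t outer = 1;  // product of dims before `axis` (num_cancats)
  for (int i = 0; i < axis; i++) outer *= dims[0][i];

  int64_t out_axis = 0;  // output extent along `axis`
  for (const auto& d : dims) out_axis += d[axis];
  const int64_t out_sum = out_axis * inner;

  int64_t offset = 0;  // running offset along the output axis
  for (size_t n = 0; n < inputs.size(); n++) {
    const int64_t in_sum = dims[n][axis] * inner;
    const float* src = inputs[n];
    float* dst = out + offset * inner;
    for (int64_t i = 0; i < outer; i++) {  // one memcpy per outer row
      std::memcpy(dst, src, sizeof(float) * in_sum);
      dst += out_sum;
      src += in_sum;
    }
    offset += dims[n][axis];
  }
}

int main() {
  // Two 2x2 matrices concatenated along axis 1 give a 2x4 matrix.
  const float a[] = {1, 2, 3, 4}, b[] = {5, 6, 7, 8};
  float out[8];
  concat_ref({a, b}, {{2, 2}, {2, 2}}, 1, out);
  for (float v : out) printf("%.0f ", v);  // prints: 1 2 5 6 3 4 7 8
  printf("\n");
  return 0;
}
```

The speed-up comes from copy granularity: the old loop issued `out_rows * num` small copies, while the new scheme issues `num * num_cancats` copies of whole `in_sum`-sized slabs.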
...@@ -70,10 +70,12 @@ void split<float>(const float* din,
     int in_after = in_strides[axis];
     int out_after = out_strides[axis];
+    const float* din_ptr = din + input_offset;
     for (int i = 0; i < before; ++i) {
-      split_cpy(din + input_offset + i * in_after,
-                out_data + i * out_after,
-                out_after);
+      std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
+      din_ptr += in_after;
+      out_data += out_after;
     }
     input_offset += out_strides[axis];
   }
......
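The split kernel gets the same treatment: the per-row `split_cpy` calls become a single pointer-walking `memcpy` loop. A standalone sketch under the same stride conventions (`before` outer rows, `in_after` source stride, `out_after` floats copied per row); `split_slice` is an illustrative name, not the Lite API:

```cpp
#include <cstdio>
#include <cstring>

// Copy one split output: `before` outer rows, each taking `out_after` floats
// from a source stride of `in_after` floats (names follow the code above).
void split_slice(const float* din, int input_offset, int before,
                 int in_after, int out_after, float* out_data) {
  const float* din_ptr = din + input_offset;
  for (int i = 0; i < before; ++i) {
    std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
    din_ptr += in_after;
    out_data += out_after;
  }
}

int main() {
  // Split a 2x4 matrix along axis 1 into two 2x2 halves.
  const float in[] = {1, 2, 3, 4, 5, 6, 7, 8};
  float lo[4], hi[4];
  split_slice(in, /*input_offset=*/0, /*before=*/2, /*in_after=*/4, /*out_after=*/2, lo);
  split_slice(in, /*input_offset=*/2, /*before=*/2, /*in_after=*/4, /*out_after=*/2, hi);
  for (float v : lo) printf("%.0f ", v);  // prints: 1 2 5 6
  for (float v : hi) printf("%.0f ", v);  // prints: 3 4 7 8
  printf("\n");
  return 0;
}
```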
...@@ -262,6 +262,7 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
   // VLOG(4) << "kernel launch";
   op_->InferShape();
   // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
......
...@@ -60,3 +60,32 @@ adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_
adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
```
On success, the generated object-detection result image test_detection_result.jpg appears in the mobile_detection directory.
8. Build and run the image classification demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make
adb -s emulator-5554 push mobile_classify /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the predicted probabilities of the top-5 classes are printed to the console.
- To print the predicted probabilities of the top-10 classes instead, append the topk value to the run command, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To run classification with another model, append its model_dir and the model's input size to the run command, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that links `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that links `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_classify: fetch_opencv mobile_classify.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
mobile_classify.o: mobile_classify.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mobile_classify.o
rm -f mobile_classify
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries:                     #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: the shared library is used by default.                #
###############################################################
# 1. Comment out the line above that links `libpaddle_light_api_shared.so`
# 2. Uncomment the line below that links `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_classify: fetch_opencv mobile_classify.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
mobile_classify.o: mobile_classify.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f mobile_classify.o
rm -f mobile_classify
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <arm_neon.h>  // required for the float32x4_t intrinsics used below
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
void load_labels(std::string path, std::vector<std::string>* labels) {
FILE* fp = fopen(path.c_str(), "r");
if (fp == nullptr) {
printf("load label file failed \n");
return;
}
  char str[1024];
  // fgets-driven loop; `while (!feof(fp))` would reprocess the last line at EOF
  while (fgets(str, sizeof(str), fp) != nullptr) {
    std::string str_s(str);
if (str_s.length() > 0) {
for (int i = 0; i < str_s.length(); i++) {
if (str_s[i] == ' ') {
std::string strr = str_s.substr(i, str_s.length() - i - 1);
labels->push_back(strr);
i = str_s.length();
}
}
}
}
fclose(fp);
}
void print_topk(const float* scores,
const int size,
const int topk,
const std::vector<std::string>& labels) {
std::vector<std::pair<float, int>> vec;
vec.resize(size);
for (int i = 0; i < size; i++) {
vec[i] = std::make_pair(scores[i], i);
}
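  // std::partial_sort moves the `topk` largest (score, index) pairs to the
  // front, comparing by score first; it assumes topk <= size.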
std::partial_sort(vec.begin(),
vec.begin() + topk,
vec.end(),
std::greater<std::pair<float, int>>());
// print topk and score
for (int i = 0; i < topk; i++) {
float score = vec[i].first;
int index = vec[i].second;
printf("i: %d, index: %d, name: %s, score: %f \n",
i,
index,
labels[index].c_str(),
score);
}
}
// fill the tensor with mean and scale, and transpose the layout from NHWC to
// NCHW; NEON-accelerated
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale) {
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
for (; i < size - 3; i += 4) {
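    // vld3q_f32 de-interleaves 12 floats (four RGB pixels) into three
    // per-channel vectors, so each channel is normalized independently.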
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
  // Scalar tail for the remaining (size % 4) pixels. Each channel must go to
  // its own output plane, normalized the same way as the NEON path above.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
  }
}
void pre_process(const cv::Mat& img,
int width,
int height,
Tensor dstTensor,
float* means,
float* scales) {
  cv::Mat rgb_img;
  // imread loads BGR; convert before resizing, otherwise rgb_img stays empty
  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
float* data = dstTensor.mutable_data<float>();
neon_mean_scale(dimg, data, width * height, means, scales);
}
void RunModel(std::string model_dir,
std::string img_path,
const std::vector<std::string>& labels,
const int topk,
int width,
int height) {
// 1. Set MobileConfig
MobileConfig config;
config.set_model_dir(model_dir);
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data from image
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize({1, 3, height, width});
auto* data = input_tensor->mutable_data<float>();
// read img and pre-process
  cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);
// pre_process(img, width, height, data);
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
pre_process(img, width, height, *input_tensor, means, scales);
// 4. Run predictor
predictor->Run();
// 5. Get output and post process
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor->data<float>();
auto shape_out = output_tensor->shape();
int64_t cnt = 1;
for (auto& i : shape_out) {
cnt *= i;
}
print_topk(outptr, cnt, topk, labels);
}
int main(int argc, char** argv) {
  if (argc < 4) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " model_dir image_path label_file [topk] [input_width]"
              << " [input_height]\n"
              << "  model_dir, image_path and label_file are required;\n"
              << "  topk, input_width and input_height are optional.\n";
    exit(1);
  }
std::string model_dir = argv[1];
std::string img_path = argv[2];
std::string label_file = argv[3];
std::vector<std::string> labels;
load_labels(label_file, &labels);
int topk = 5;
int height = 224;
int width = 224;
if (argc > 4) {
topk = atoi(argv[4]);
}
if (argc > 6) {
width = atoi(argv[5]);
height = atoi(argv[6]);
}
RunModel(model_dir, img_path, labels, topk, width, height);
return 0;
}
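For reference, `pre_process` above rescales pixels by 1/255 and `neon_mean_scale` then applies the standard per-channel normalization (x - mean) / scale while de-interleaving HWC into planar CHW; the NEON path simply handles four pixels per iteration. A scalar sketch of the same arithmetic, using the constants from `RunModel`:

```cpp
#include <cstdio>

int main() {
  // One RGB pixel already rescaled to [0, 1] by convertTo(..., 1 / 255.f).
  const float pixel[3] = {0.5f, 0.5f, 0.5f};
  const float mean[3] = {0.485f, 0.456f, 0.406f};
  const float scale[3] = {0.229f, 0.224f, 0.225f};
  for (int c = 0; c < 3; c++) {
    // Same arithmetic as the NEON path: (x - mean) * (1 / scale).
    printf("channel %d: %f\n", c, (pixel[c] - mean[c]) / scale[c]);
  }
  return 0;
}
```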
...@@ -89,16 +89,20 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") !=
       input_arg_names.end()) {
     auto args = opdesc.Input("AxisTensor");
-    auto *var = scope->FindVar(args.front());
-    param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    }
   }
   if (std::find(input_arg_names.begin(),
                 input_arg_names.end(),
                 "SectionsTensorList") != input_arg_names.end()) {
     auto args = opdesc.Input("SectionsTensorList");
-    auto *var = scope->FindVar(args.front());
-    param_.sections_tensor_list =
-        *(var->GetMutable<std::vector<lite::Tensor *>>());
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.sections_tensor_list =
+          *(var->GetMutable<std::vector<lite::Tensor *>>());
+    }
   }
   return true;
 }
......
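The split_op.cc change above is the model_optimize_tool fix called out in the commit title: an op description can list `AxisTensor` or `SectionsTensorList` with an empty argument list, and calling `args.front()` on an empty vector is undefined behavior before `scope->FindVar` ever runs. A reduced sketch of the guarded-lookup pattern, with the Lite types stubbed out as illustrative stand-ins:

```cpp
#include <string>
#include <vector>

// Stand-ins for lite::Tensor and lite::Scope, for illustration only.
struct Tensor {};
struct Scope {
  Tensor* FindVar(const std::string& name) { return &dummy_; }
  Tensor dummy_;
};

// Optional input: attach it only when the op actually binds an argument.
// Without the emptiness check, args.front() on an empty vector is UB.
Tensor* attach_optional_input(Scope* scope,
                              const std::vector<std::string>& args) {
  if (args.empty()) return nullptr;
  return scope->FindVar(args.front());
}
```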
...@@ -7,5 +7,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_
         image_flip.cc
        image_rotate.cc
        image_resize.cc
-       DEPS ${lite_cv_deps} paddle_api_light)
+       DEPS ${lite_cv_deps} paddle_api place)
 endif()