From 186397fe29dfb57e721bcc67bab3c352c6f4a9aa Mon Sep 17 00:00:00 2001 From: HappyAngel Date: Mon, 23 Dec 2019 20:12:05 +0800 Subject: [PATCH] [lite][arm]fix model_optimize bug, update concat and split op, speed up (#2620) --- CMakeLists.txt | 2 +- cmake/lite.cmake | 14 +- lite/CMakeLists.txt | 6 +- lite/api/CMakeLists.txt | 74 ++++--- lite/backends/arm/math/concat.cc | 43 ++-- lite/backends/arm/math/split.cc | 8 +- lite/core/program.cc | 1 + lite/demo/cxx/README.md | 29 +++ .../mobile_classify/Makefile.android.armv7 | 61 ++++++ .../mobile_classify/Makefile.android.armv8 | 61 ++++++ .../cxx/mobile_classify/mobile_classify.cc | 195 ++++++++++++++++++ lite/operators/split_op.cc | 14 +- lite/utils/cv/CMakeLists.txt | 2 +- 13 files changed, 434 insertions(+), 76 deletions(-) create mode 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 create mode 100644 lite/demo/cxx/mobile_classify/mobile_classify.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 77a94bea1e..0c71a45ffc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF) lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options -lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM) +lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d6b374529e..d01e2d67ed 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -44,7 +44,7 @@ function (lite_deps TARGET) set(deps ${deps} ${var}) endforeach(var) if(LITE_WITH_CV) - foreach(var ${lite_cv_deps}) + foreach(var ${lite_deps_CV_DEPS}) set(deps ${deps} ${var}) endforeach(var) endif() @@ -115,10 +115,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean # LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK # HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK # EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None +# CV_DEPS: LITE_WITH_CV function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -129,6 +130,7 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} + CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} @@ -162,7 +164,7 @@ function(lite_cc_binary TARGET) endif() 
 set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
+    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  set(deps "")
@@ -178,6 +180,7 @@ function(lite_cc_binary TARGET)
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
+      CV_DEPS ${args_CV_DEPS}
      )
  cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
  target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -208,7 +211,7 @@ function(lite_cc_test TARGET)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
+    LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
     ARGS COMPILE_LEVEL # (basic|extra)
     )
@@ -232,6 +235,7 @@ function(lite_cc_test TARGET)
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
+      CV_DEPS ${args_CV_DEPS}
      )
  _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
  # strip binary target to reduce size
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index df6b7d3648..07d78320db 100644
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -222,7 +222,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
             )
     add_dependencies(publish_inference_android_cxx_demos logging gflags)
     add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
@@ -236,7 +237,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
             )
     add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
 endif()
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 70239e94e7..2264d0052e 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -24,12 +24,16 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
   endif()
   if(LITE_WITH_CUDA)
     target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
-  endif(LITE_WITH_CUDA)
+  endif(LITE_WITH_CUDA)
+
   #light api dynamic library
   lite_cc_library(paddle_light_api_shared MODULE
-    SRCS light_api_shared.cc
-    DEPS ${light_lib_DEPS}
-    ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
+    SRCS light_api_shared.cc
+    DEPS ${light_lib_DEPS}
+    ARM_DEPS ${arm_kernels}
+    CV_DEPS paddle_cv_arm
+    NPU_DEPS ${npu_kernels})
+
   target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
   if (LITE_WITH_NPU)
     # Strips the symbols of our protobuf functions to fix the conflicts during
@@ -75,16 +79,17 @@ message(STATUS "get FPGA kernels ${fpga_kernels}")
 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
     set(cxx_api_deps
-        scope optimizer target_wrapper_host model_parser program)
+        scope optimizer target_wrapper_host model_parser program)
     lite_cc_library(cxx_api
-        SRCS cxx_api.cc
-        DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
-        X86_DEPS ${x86_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        SRCS cxx_api.cc
+        DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
+        X86_DEPS ${x86_kernels}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
+        XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels})
 endif()

 # for light api
@@ -100,6 +105,7 @@ lite_cc_library(light_api SRCS light_api.cc
     CUDA_DEPS ${cuda_kernels}
     X86_DEPS ${x86_kernels}
     ARM_DEPS ${arm_kernels}
+    CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
@@ -224,11 +230,12 @@ else()
 endif()
 if (NOT LITE_ON_TINY_PUBLISH)
     lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light
-        ${ops}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        ${ops}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels})
     # The final inference library for just MobileConfig.
     bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -258,7 +265,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
     add_subdirectory(android)
 endif()

-if (LITE_WITH_PYTHON)
+if (LITE_WITH_PYTHON)
     add_subdirectory(python)
 endif()
@@ -288,25 +295,16 @@ endif()
 # Some bins
 if(NOT IOS)
-    lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
-        ${ops} ${host_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
-    lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
-        ${ops} ${host_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
-
+    lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+        ${ops} ${host_kernels}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        XPU_DEPS ${xpu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        X86_DEPS ${x86_kernels}
+        CUDA_DEPS ${cuda_kernels})
 endif()

 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc
index 9b94cefa16..65f9345338 100644
--- a/lite/backends/arm/math/concat.cc
+++ b/lite/backends/arm/math/concat.cc
@@ -26,31 +26,32 @@ namespace math {
 void concat_func(const std::vector<lite::Tensor *> &input,
                  const int axis,
                  lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
+  int64_t concat_input_size = 1;
+  int64_t num_concats = 1;
   auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
+  size_t num = input.size();
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
   }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int64_t> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
+  for (int i = 0; i < axis; i++) {
+    num_concats *= dim_0[i];
   }
-
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
+  float *dst_ptr = output->mutable_data<float>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n]->dims();
+    const float *src_ptr = input[n]->data<float>();
+    int64_t in_concat_axis = dims[axis];
+    float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_concats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
     }
+    offset_concat_axis += in_concat_axis;
   }
 }
diff --git a/lite/backends/arm/math/split.cc b/lite/backends/arm/math/split.cc
index 54ea7e62c2..bff29af93b 100644
--- a/lite/backends/arm/math/split.cc
+++ b/lite/backends/arm/math/split.cc
@@ -70,10 +70,12 @@ void split(const float* din,
     int in_after = in_strides[axis];
     int out_after = out_strides[axis];
+    const float* din_ptr = din + input_offset;
+
     for (int i = 0; i < before; ++i) {
-      split_cpy(din + input_offset + i * in_after,
-                out_data + i * out_after,
-                out_after);
+      std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
+      din_ptr += in_after;
+      out_data += out_after;
     }
     input_offset += out_strides[axis];
   }
diff --git a/lite/core/program.cc b/lite/core/program.cc
index b0c61bf00e..8dc8fb0ddd 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -262,6 +262,7 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
+  // VLOG(4) << "kernel launch";
   op_->InferShape();
   // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md
index b7768d763e..5e0ec49add 100644
--- a/lite/demo/cxx/README.md
+++ b/lite/demo/cxx/README.md
@@ -60,3 +60,32 @@ adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_
 adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
 ```
 On success, the detection result image test_detection_result.jpg is generated in the mobile_detection directory.
+
+8. Build and run the image classification demo
+```shell
+cd ../mobile_classify
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+make
+adb -s emulator-5554 push mobile_classify /data/local/tmp/
+adb -s emulator-5554 push test.jpg /data/local/tmp/
+adb -s emulator-5554 push labels.txt /data/local/tmp/
+adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
+adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+On success, the predicted probabilities of the top-5 classes are printed to the console.
+- To print the top-10 classes instead, append a topk value to the run command, e.g.:
+  ```shell
+  adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
+  ```
+- To classify with another model, pass its model_dir and the model's input size, e.g.:
+  ```shell
+  adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
+  ```
+
diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7
new file mode 100644
index 0000000000..8d446af9b1
--- /dev/null
+++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7
@@ -0,0 +1,61 @@
+ARM_ABI = arm7
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of the static libraries:                     #
+#  `libpaddle_api_full_bundled.a`                             #
+#  `libpaddle_api_light_bundled.a`                            #
+###############################################################
+# Note: the shared library is used by default.                #
+###############################################################
+# 1. Comment out the line above that links `libpaddle_light_api_shared.so`.
+# 2. Uncomment the line below that links `libpaddle_api_light_bundled.a`.
+
+#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobile_classify: fetch_opencv mobile_classify.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
+
+mobile_classify.o: mobile_classify.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
+
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	(echo "fetch opencv libs" && \
+	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean
+clean:
+	rm -f mobile_classify.o
+	rm -f mobile_classify
diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8
new file mode 100644
index 0000000000..255c42f2dc
--- /dev/null
+++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8
@@ -0,0 +1,61 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of the static libraries:                     #
+#  `libpaddle_api_full_bundled.a`                             #
+#  `libpaddle_api_light_bundled.a`                            #
###############################################################
+# Note: the shared library is used by default.                #
+###############################################################
+# 1. Comment out the line above that links `libpaddle_light_api_shared.so`.
+# 2. Uncomment the line below that links `libpaddle_api_light_bundled.a`.
+
+#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobile_classify: fetch_opencv mobile_classify.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
+
+mobile_classify.o: mobile_classify.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
+
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	(echo "fetch opencv libs" && \
+	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean
+clean:
+	rm -f mobile_classify.o
+	rm -f mobile_classify
diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc
new file mode 100644
index 0000000000..c651bf9f4c
--- /dev/null
+++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arm_neon.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+void load_labels(std::string path, std::vector<std::string>* labels) {
+  FILE* fp = fopen(path.c_str(), "r");
+  if (fp == nullptr) {
+    printf("load label file failed \n");
+    return;
+  }
+  char str[1024];
+  while (fgets(str, 1024, fp) != nullptr) {
+    std::string str_s(str);
+
+    if (str_s.length() > 0) {
+      for (size_t i = 0; i < str_s.length(); i++) {
+        if (str_s[i] == ' ') {
+          // keep the text after the class id, dropping the trailing newline
+          std::string strr = str_s.substr(i, str_s.length() - i - 1);
+          labels->push_back(strr);
+          break;
+        }
+      }
+    }
+  }
+  fclose(fp);
+}
+
+void print_topk(const float* scores,
+                const int size,
+                const int topk,
+                const std::vector<std::string>& labels) {
+  std::vector<std::pair<float, int>> vec;
+  vec.resize(size);
+  for (int i = 0; i < size; i++) {
+    vec[i] = std::make_pair(scores[i], i);
+  }
+
+  std::partial_sort(vec.begin(),
+                    vec.begin() + topk,
+                    vec.end(),
+                    std::greater<std::pair<float, int>>());
+
+  // print topk and score
+  for (int i = 0; i < topk; i++) {
+    float score = vec[i].first;
+    int index = vec[i].second;
+    printf("i: %d, index: %d, name: %s, score: %f \n",
+           i,
+           index,
+           labels[index].c_str(),
+           score);
+  }
+}
+
+// fill tensor with mean and scale, and transform layout NHWC -> NCHW; NEON speed-up
+void neon_mean_scale(
+    const float* din, float* dout, int size, float* mean, float* scale) {
+  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
+  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
+  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
+  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
+  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
+  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
+
+  float* dout_c0 = dout;
+  float* dout_c1 = dout + size;
+  float* dout_c2 = dout + size * 2;
+
+  int i = 0;
+  for (; i < size - 3; i += 4) {
+    float32x4x3_t vin3 = vld3q_f32(din);
+    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
+    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
+    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
+    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
+    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
+    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
+    vst1q_f32(dout_c0, vs0);
+    vst1q_f32(dout_c1, vs1);
+    vst1q_f32(dout_c2, vs2);
+
+    din += 12;
+    dout_c0 += 4;
+    dout_c1 += 4;
+    dout_c2 += 4;
+  }
+  for (; i < size; i++) {
+    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
+    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
+    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
+  }
+}
+
+void pre_process(const cv::Mat& img,
+                 int width,
+                 int height,
+                 Tensor dstTensor,
+                 float* means,
+                 float* scales) {
+  cv::Mat rgb_img;
+  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
+  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
+  cv::Mat imgf;
+  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
+  const float* dimg = reinterpret_cast<const float*>(imgf.data);
+  float* data = dstTensor.mutable_data<float>();
+  neon_mean_scale(dimg, data, width * height, means, scales);
+}
+
+void RunModel(std::string model_dir,
+              std::string img_path,
+              const std::vector<std::string>& labels,
+              const int topk,
+              int width,
+              int height) {
+  // 1. Set MobileConfig
+  MobileConfig config;
+  config.set_model_dir(model_dir);
+
+  // 2. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 3. Prepare input data from image
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, height, width});
+  // read img and pre-process
+  cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);
+  float means[3] = {0.485f, 0.456f, 0.406f};
+  float scales[3] = {0.229f, 0.224f, 0.225f};
+  pre_process(img, width, height, *input_tensor, means, scales);
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output and post process
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  auto* outptr = output_tensor->data<float>();
+  auto shape_out = output_tensor->shape();
+  int64_t cnt = 1;
+  for (auto& i : shape_out) {
+    cnt *= i;
+  }
+  print_topk(outptr, cnt, topk, labels);
+}
+
+int main(int argc, char** argv) {
+  if (argc < 4) {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " model_dir image_path label_file\n";
+    exit(1);
+  }
+  printf("parameter: model_dir, image_path and label_file are necessary \n");
+  printf("parameter: topk, input_width, input_height, are optional \n");
+  std::string model_dir = argv[1];
+  std::string img_path = argv[2];
+  std::string label_file = argv[3];
+  std::vector<std::string> labels;
+  load_labels(label_file, &labels);
+  int topk = 5;
+  int height = 224;
+  int width = 224;
+  if (argc > 4) {
+    topk = atoi(argv[4]);
+  }
+  if (argc > 6) {
+    width = atoi(argv[5]);
+    height = atoi(argv[6]);
+  }
+
+  RunModel(model_dir, img_path, labels, topk, width, height);
+  return 0;
+}
diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc
index ec98a0d6c3..834d68a315 100644
--- a/lite/operators/split_op.cc
+++ b/lite/operators/split_op.cc
@@ -89,16 +89,20 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") !=
       input_arg_names.end()) {
     auto args = opdesc.Input("AxisTensor");
-    auto *var = scope->FindVar(args.front());
-    param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    }
   }
   if (std::find(input_arg_names.begin(),
                 input_arg_names.end(),
                 "SectionsTensorList") != input_arg_names.end()) {
     auto args = opdesc.Input("SectionsTensorList");
-    auto *var = scope->FindVar(args.front());
-    param_.sections_tensor_list =
-        *(var->GetMutable<std::vector<lite::Tensor *>>());
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.sections_tensor_list =
+          *(var->GetMutable<std::vector<lite::Tensor *>>());
+    }
   }
   return true;
 }
diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt
index 01f5341c97..0edcb2ef24 100644
--- a/lite/utils/cv/CMakeLists.txt
+++ b/lite/utils/cv/CMakeLists.txt
@@ -7,5 +7,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_
         image_flip.cc
         image_rotate.cc
         image_resize.cc
-        DEPS ${lite_cv_deps} paddle_api_light)
+        DEPS ${lite_cv_deps} paddle_api place)
 endif()
-- 
GitLab
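
A note on the core speed-up in `concat_func` above: instead of copying one output row at a time, the new code issues one large `memcpy` per input per outer slice. The sketch below is a minimal standalone illustration of the same copy pattern (the `concat_sketch` name, plain-array interface, and example shapes are hypothetical, not code from this patch):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Concatenate float inputs along one axis. num_concats is the product of
// the dims before the axis, inner_size the product of the dims after it,
// and axis_sizes[n] is input n's extent on the concat axis. Mirroring the
// patched concat_func, each input contributes one contiguous run of
// axis_sizes[n] * inner_size floats per outer slice.
void concat_sketch(const std::vector<const float*>& inputs,
                   const std::vector<int64_t>& axis_sizes,
                   int64_t num_concats,
                   int64_t inner_size,
                   float* out) {
  int64_t out_axis_size = 0;
  for (int64_t s : axis_sizes) out_axis_size += s;
  const int64_t out_stride = out_axis_size * inner_size;  // floats per outer slice
  int64_t axis_offset = 0;  // running offset along the concat axis
  for (size_t n = 0; n < inputs.size(); ++n) {
    const int64_t in_stride = axis_sizes[n] * inner_size;
    const float* src = inputs[n];
    float* dst = out + axis_offset * inner_size;
    for (int64_t i = 0; i < num_concats; ++i) {
      std::memcpy(dst, src, sizeof(float) * in_stride);  // one big copy
      src += in_stride;
      dst += out_stride;
    }
    axis_offset += axis_sizes[n];
  }
}
```

For example, concatenating shapes [2, 3, 4] and [2, 5, 4] on axis 1 gives `num_concats = 2` and `inner_size = 4`, so the whole concat is four `memcpy` calls (two of 12 floats, two of 20) rather than a copy per row.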