diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 815c5423ffc5ddbe71a55199c7520f645f97c01a..2264e57b9d9e2cde2ef29f6273b538dbb60c6447 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -226,6 +226,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -243,6 +245,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) 
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 874b5bb34dc12c1d979bc2e662f506c22350a776..d91fe9cd50b59b33b31f5e93a3ab79faee5a944b 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -35,6 +35,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -45,8 +46,8 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -123,6 +124,7 @@ if(WITH_TESTING) X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -285,6 +287,7 @@ endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light ${ops} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -307,9 +310,11 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS 
${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index dc9fac96ee848d73ca14c8dc4555c0f44951400a..5b063a8ef19c85d3818d2ca57659170d7d86357d 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -86,6 +86,7 @@ void Run(const std::vector>& input_shapes, for (int i = 0; i < input_shapes[j].size(); ++i) { input_num *= input_shapes[j][i]; } + for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index c1cf90169b13e35ad4445addca9575eb443b8204..3217a7ed49006325715e22f8aa82d155bc8bf927 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -68,26 +68,44 @@ adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./ cd ../mobile_classify wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model make -adb push mobile_classify /data/local/tmp/ -adb push test.jpg /data/local/tmp/ -adb push labels.txt /data/local/tmp/ -adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb shell chmod +x /data/local/tmp/mobile_classify -adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" + +adb -s emulator-5554 push mobile_classify /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push labels.txt /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify +adb -s emulator-5554 shell "export 
LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" ``` 运行成功将在控制台输出预测结果的前5个类别的预测概率 - 如若想看前10个类别的预测概率,在运行命令输入topk的值即可 eg: ```shell - adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" + adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" ``` - 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可 eg: ```shell - adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" + adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" ``` +9. 
编译含CV预处理库模型单测demo
+```shell
+cd ../test_cv
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+./model_optimize_tool optimize model
+make
+adb -s emulator-5554 push test_model_cv /data/local/tmp/
+adb -s emulator-5554 push test.jpg /data/local/tmp/
+adb -s emulator-5554 push labels.txt /data/local/tmp/
+adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv
+adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+运行成功将在控制台输出预测结果的前10个类别的预测概率
diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7
new file mode 100644
index 0000000000000000000000000000000000000000..d659a316cd856fd550e83b125573409f239b8cf2
--- /dev/null
+++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7
@@ -0,0 +1,71 @@
+ARM_ABI = arm7
+LITE_WITH_CV = ON
+export ARM_ABI
+export LITE_WITH_CV
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of static library:                           #
+#  `libpaddle_api_full_bundled.a`                              #
+#  `libpaddle_api_light_bundled.a`                             #
+###############################################################
+# Note: default use lite's shared library.                    #
+###############################################################
+# 1. Comment above line using `libpaddle_full_api_shared.so`
+# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
+
+#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+test_model_cv: fetch_opencv test_model_cv.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
+
+test_model_cv.o: test_model_cv.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
+
+test_img_prepross: fetch_opencv test_img_prepross.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
+
+test_img_prepross.o: test_img_prepross.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
+
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir -p ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	      (echo "fetch opencv libs" && \
+	      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	      tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean fetch_opencv
+clean:
+	rm -f test_model_cv.o
+	rm -f test_model_cv
+	rm -f test_img_prepross.o
+	rm -f test_img_prepross
diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8
new file mode 100644
index 0000000000000000000000000000000000000000..c80b07d5c029a3624a514e07375fd08e8770da25
--- /dev/null
+++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8
@@ -0,0 +1,70 @@
+ARM_ABI = arm8
+LITE_WITH_CV = ON
+export ARM_ABI
+export LITE_WITH_CV
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
+###############################################################
+# How to use one of static library:                           #
+#  `libpaddle_api_full_bundled.a`                              #
+#  `libpaddle_api_light_bundled.a`                             #
+###############################################################
+# Note: default use lite's shared library.                    #
+###############################################################
+# 1. Comment above line using `libpaddle_full_api_shared.so`
+# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
+
+#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+test_model_cv: fetch_opencv test_model_cv.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
+
+test_model_cv.o: test_model_cv.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
+
+test_img_prepross: fetch_opencv test_img_prepross.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
+
+test_img_prepross.o: test_img_prepross.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
+
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir -p ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	      (echo "fetch opencv libs" && \
+	      wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	      tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean fetch_opencv
+clean:
+	rm -f test_model_cv.o
+	rm -f test_model_cv
+	rm -f test_img_prepross.o
+	rm -f test_img_prepross
diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc
index 
c651bf9f4cca0db0e126311e5a03b3ade6ccf886..d0cf59e185e1330b7d8487d562afa0af29236007 100644
--- a/lite/demo/cxx/mobile_classify/mobile_classify.cc
+++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc
@@ -117,7 +117,7 @@ void pre_process(const cv::Mat& img,
                  float* means,
                  float* scales) {
   cv::Mat rgb_img;
-  // cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
+  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
   cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
   cv::Mat imgf;
   rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..36d2985a4fd4f243027f8caab9b6c5a8beb94cad
--- /dev/null
+++ b/lite/demo/cxx/test_cv/README.md
@@ -0,0 +1,131 @@
+# 图像预测库的使用
+1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式
+example:
+```shell
+set BUILD_WITH_CV=ON or LITE_WITH_CV=ON
+./lite/tools/build.sh \
+--arm_os=android \
+--arm_abi=armv8 \
+--arm_lang=gcc \
+--android_stl=c++_static \
+full_publish
+```
+
+2. 准备模型和优化模型
+example:
+```shell
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+./lite/tools/build.sh build_optimize_tool
+./build.model_optimize_tool/lite/api/model_optimize_tool \
+--optimize_out_type=naive_buffer \
+--optimize_out=model_dir \
+--model_dir=model_dir \
+--prefer_int8_kernel=false
+```
+
+3. 编译并运行完整test_model_cv demo
+example:
+```shell
+cd inference_lite_lib.android.armv8/demo/cxx/test_cv
+```
+
+- 修改Makefile, 注释编译test_img_prepross 语句
+  ```shell
+  test_model_cv: fetch_opencv test_model_cv.o
+      $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
+
+  test_model_cv.o: test_model_cv.cc
+      $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
+
+  #test_img_prepross: fetch_opencv test_img_prepross.o
+  #    $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
+
+  #test_img_prepross.o: test_img_prepross.cc
+  #    $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
+
+  .PHONY: clean
+  clean:
+      rm -f test_model_cv.o
+      rm -f test_model_cv
+      #rm -f test_img_prepross.o
+      #rm -f test_img_prepross
+  ```
+- 修改../../../cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径
+  ```shell
+  origin:
+  #include "lite/api/paddle_api.h"
+  #include "lite/api/paddle_place.h"
+  now:
+  #include "paddle_api.h"
+  #include "paddle_place.h"
+  ```
+- 测试模型必须是优化后的模型
+
+```shell
+make
+
+adb -s device_id push mobilenet_v1 /data/local/tmp/
+adb -s device_id push test_model_cv /data/local/tmp/
+adb -s device_id push test.jpg /data/local/tmp/
+adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
+adb -s device_id shell chmod +x /data/local/tmp/test_model_cv
+adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 "
+```
+运行成功将在控制台输出部分预测结果
+
+4. 编译并运行完整test_img_prepross demo
+example:
+```shell
+cd inference_lite_lib.android.armv8/demo/cxx/test_cv
+```
+
+- 修改Makefile, 注释编译test_model_cv 语句
+  ```shell
+  #test_model_cv: fetch_opencv test_model_cv.o
+  #    $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
+
+  #test_model_cv.o: test_model_cv.cc
+  #    $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
+
+  test_img_prepross: fetch_opencv test_img_prepross.o
+      $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
+
+  test_img_prepross.o: test_img_prepross.cc
+      $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
+
+  .PHONY: clean
+  clean:
+      #rm -f test_model_cv.o
+      #rm -f test_model_cv
+      rm -f test_img_prepross.o
+      rm -f test_img_prepross
+  ```
+- 修改../../../cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径
+  ```shell
+  origin:
+  #include "lite/api/paddle_api.h"
+  #include "lite/api/paddle_place.h"
+  now:
+  #include "paddle_api.h"
+  #include "paddle_place.h"
+  ```
+- 测试模型必须是优化后的模型
+
+```shell
+make
+
+adb -s device_id push mobilenet_v1 /data/local/tmp/
+adb -s device_id push test_img_prepross /data/local/tmp/
+adb -s device_id push test.jpg /data/local/tmp/
+adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
+adb -s device_id shell chmod +x /data/local/tmp/test_img_prepross
+adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/test_img_prepross /data/local/tmp/test.jpg /data/local/tmp/ 3 3 224 224 /data/local/tmp/mobilenet_v1 "
+adb -s device_id pull /data/local/tmp/resize.jpg ./
+adb -s device_id pull /data/local/tmp/convert.jpg ./
+adb -s device_id pull /data/local/tmp/flip.jpg ./
+adb -s device_id pull /data/local/tmp/rotate.jpg ./
+```
+运行成功将在控制台输出OpenCV 和 Paddle-Lite的耗时;同时,将在test_cv目录下看到生成的图像预处理结果图: 
如:resize.jpg、convert.jpg等
diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c2cbd66cc0a15a1032141641d83fbf8db85d20bf
--- /dev/null
+++ b/lite/demo/cxx/test_cv/test_img_prepross.cc
@@ -0,0 +1,395 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"               // NOLINT
+#include "paddle_image_preprocess.h"  // NOLINT
+#include "time.h"                     // NOLINT
+typedef paddle::lite_api::Tensor Tensor;
+typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
+typedef paddle::lite::utils::cv::FlipParam FlipParam;
+typedef paddle::lite::utils::cv::TransParam TransParam;
+typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
+typedef paddle::lite_api::DataLayoutType LayoutType;
+using namespace paddle::lite_api;  // NOLINT
+
+// Copies a tightly packed, interleaved 8-bit pixel buffer into `mat`.
+// `src` must hold mat.rows * mat.cols * mat.channels() bytes.
+void fill_with_mat(cv::Mat& mat, uint8_t* src) {  // NOLINT
+  const int row_bytes = mat.cols * mat.channels();
+  for (int i = 0; i < mat.rows; i++) {
+    uint8_t* dst_row = mat.ptr<uint8_t>(i);
+    const uint8_t* src_row = src + i * row_bytes;
+    for (int j = 0; j < row_bytes; j++) {
+      dst_row[j] = src_row[j];
+    }
+  }
+}
+
+// Benchmarks OpenCV against Paddle-Lite ImagePreprocess (convert, resize,
+// rotate, flip, image2Tensor) for every (cluster_id, thread_num) pair and
+// writes the Paddle-Lite results to dst_path/{convert,resize,rotate,flip}.jpg.
+// Only GRAY/BGR/RGB sources are accepted, and srcFormat must either equal
+// dstFormat or be a BGR<->GRAY pair; anything else is reported and aborts.
+void test_img(std::vector<int> cluster_id,
+              std::vector<int> thread_num,
+              std::string img_path,
+              std::string dst_path,
+              ImageFormat srcFormat,
+              ImageFormat dstFormat,
+              int width,
+              int height,
+              float rotate,
+              FlipParam flip,
+              LayoutType layout,
+              std::string model_dir,
+              int test_iter = 1) {
+  // Normalization constants handed to image2Tensor.
+  float means[3] = {0.485f, 0.456f, 0.406f};
+  float scales[3] = {0.229f, 0.224f, 0.225f};
+  for (auto& cls : cluster_id) {
+    for (auto& th : thread_num) {
+      std::cout << "cluster: " << cls << ", threads: " << th << std::endl;
+      // 1. Set MobileConfig
+      MobileConfig config;
+      config.set_model_dir(model_dir);
+      config.set_power_mode((PowerMode)cls);
+      config.set_threads(th);
+      std::cout << "model: " << model_dir;
+
+      // 2. Create PaddlePredictor by MobileConfig
+      std::shared_ptr<PaddlePredictor> predictor =
+          CreatePaddlePredictor<MobileConfig>(config);
+
+      // 3. Prepare input data from image
+      std::unique_ptr<Tensor> input_tensor(predictor->GetInput(0));
+
+      /*
+      imread(img_path, param)
+      IMREAD_UNCHANGED (<0) loads the image as is, without any change
+      IMREAD_GRAYSCALE ( 0) loads the image as a grayscale image
+      IMREAD_COLOR     (>0) loads the image as a 3-channel color image
+      */
+      cv::Mat img;
+      if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
+        img = imread(img_path, cv::IMREAD_COLOR);
+      } else if (srcFormat == ImageFormat::GRAY) {
+        img = imread(img_path, cv::IMREAD_GRAYSCALE);
+      } else {
+        printf("this format %d does not support \n", srcFormat);
+        return;
+      }
+      if (img.empty()) {
+        std::cout << "opencv read image " << img_path.c_str() << " failed"
+                  << std::endl;
+        return;
+      }
+      int srch = img.rows;
+      int srcw = img.cols;
+      int dsth = height;
+      int dstw = width;
+
+      std::cout << " input tensor size, num= " << 1 << ", channel= " << 1
+                << ", height= " << srch << ", width= " << srcw
+                << ", srcFormat= " << (ImageFormat)srcFormat << std::endl;
+      // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12,
+      if (srcFormat == ImageFormat::GRAY) {
+        std::cout << "srcFormat: GRAY" << std::endl;
+      }
+      if (srcFormat == ImageFormat::BGR) {
+        std::cout << "srcFormat: BGR" << std::endl;
+      }
+      if (srcFormat == ImageFormat::RGB) {
+        std::cout << "srcFormat: RGB" << std::endl;
+      }
+      std::cout << " output tensor size, num=" << 1 << ", channel=" << 1
+                << ", height=" << dsth << ", width=" << dstw
+                << ", dstFormat= " << (ImageFormat)dstFormat << std::endl;
+
+      if (dstFormat == ImageFormat::GRAY) {
+        std::cout << "dstFormat: GRAY" << std::endl;
+      }
+      if (dstFormat == ImageFormat::BGR) {
+        std::cout << "dstFormat: BGR" << std::endl;
+      }
+      if (dstFormat == ImageFormat::RGB) {
+        std::cout << "dstFormat: RGB" << std::endl;
+      }
+
+      std::cout << "Rotate = " << rotate << ", Flip = " << flip
+                << ", Layout = " << static_cast<int>(layout) << std::endl;
+      if (static_cast<int>(layout) != 1 && static_cast<int>(layout) != 3) {
+        std::cout << "this layout" << static_cast<int>(layout)
+                  << " is no support" << std::endl;
+      }
+      uint8_t* src = img.data;
+
+      int out_size = srch * srcw;
+      int resize = dstw * dsth;
+      if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) {
+        out_size = 3 * srch * srcw;
+        resize = 3 * dsth * dstw;
+      } else if (dstFormat == ImageFormat::GRAY) {
+        out_size = srch * srcw;
+        resize = dsth * dstw;
+      }
+      // Output buffers. std::vector releases them on every exit path,
+      // including the early return for unsupported conversions below.
+      std::vector<uint8_t> lite_dst_buf(out_size);
+      std::vector<uint8_t> resize_tmp_buf(resize);
+      std::vector<uint8_t> tv_out_ratote_buf(out_size);
+      std::vector<uint8_t> tv_out_flip_buf(out_size);
+      uint8_t* lite_dst = lite_dst_buf.data();
+      uint8_t* resize_tmp = resize_tmp_buf.data();
+      uint8_t* tv_out_ratote = tv_out_ratote_buf.data();
+      uint8_t* tv_out_flip = tv_out_flip_buf.data();
+      std::vector<int64_t> shape_out = {1, 3, srch, srcw};
+
+      input_tensor->Resize(shape_out);
+      Tensor dst_tensor = *input_tensor;
+      std::cout << "opencv compute" << std::endl;
+      cv::Mat im_convert;
+      cv::Mat im_resize;
+      cv::Mat im_rotate;
+      cv::Mat im_flip;
+      double to_1 = 0;
+      double to_2 = 0;
+      double to_3 = 0;
+      double to_4 = 0;
+      double to1 = 0;
+      for (int i = 0; i < test_iter; i++) {
+        clock_t start = clock();
+        clock_t begin = clock();
+        // convert bgr-gray
+        if (dstFormat == srcFormat) {
+          im_convert = img;
+        } else if (dstFormat == ImageFormat::BGR &&
+                   srcFormat == ImageFormat::GRAY) {
+          cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR);
+        } else if (srcFormat == ImageFormat::BGR &&
+                   dstFormat == ImageFormat::GRAY) {
+          cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY);
+        } else {
+          // Any other src/dst pair has no OpenCV reference path here.
+          printf("convert format error \n");
+          return;
+        }
+        clock_t end = clock();
+        to_1 += (end - begin);
+
+        begin = clock();
+        // resize default linear
+        cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f);
+        end = clock();
+        to_2 += (end - begin);
+
+        begin = clock();
+        // rotate 90
+        if (rotate == 90) {
+          cv::flip(im_convert.t(), im_rotate, 1);
+        } else if (rotate == 180) {
+          cv::flip(im_convert, im_rotate, -1);
+        } else if (rotate == 270) {
+          cv::flip(im_convert.t(), im_rotate, 0);
+        }
+        end = clock();
+        to_3 += (end - begin);
+
+        begin = clock();
+        // flip
+        cv::flip(im_convert, im_flip, flip);
+        end = clock();
+        to_4 += (end - begin);
+        clock_t ovet = clock();
+        to1 += (ovet - start);
+      }
+
+      std::cout << "Paddle-lite compute" << std::endl;
+      double lite_to = 0;
+      double lite_to_1 = 0;
+      double lite_to_2 = 0;
+      double lite_to_3 = 0;
+      double lite_to_4 = 0;
+      double lite_to_5 = 0;
+      TransParam tparam;
+      tparam.ih = srch;
+      tparam.iw = srcw;
+      tparam.oh = dsth;
+      tparam.ow = dstw;
+      tparam.flip_param = flip;
+      tparam.rotate_param = rotate;
+
+      ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
+
+      for (int i = 0; i < test_iter; ++i) {
+        clock_t start = clock();
+        clock_t begin = clock();
+        image_preprocess.imageConvert(src, lite_dst);
+        clock_t end = clock();
+        lite_to_1 += (end - begin);
+
+        begin = clock();
+        image_preprocess.imageResize(lite_dst, resize_tmp);
+        end = clock();
+        lite_to_2 += (end - begin);
+
+        begin = clock();
+        // Use the requested angle so the output matches rotate_mat below.
+        image_preprocess.imageRotate(lite_dst,
+                                     tv_out_ratote,
+                                     (ImageFormat)dstFormat,
+                                     srcw,
+                                     srch,
+                                     rotate);
+        end = clock();
+        lite_to_3 += (end - begin);
+
+        begin = clock();
+        image_preprocess.imageFlip(
+            lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip);
+        end = clock();
+        lite_to_4 += (end - begin);
+
+        clock_t over = clock();
+        lite_to += (over - start);
+
+        begin = clock();
+        image_preprocess.image2Tensor(lite_dst,
+                                      &dst_tensor,
+                                      (ImageFormat)dstFormat,
+                                      srcw,
+                                      srch,
+                                      layout,
+                                      means,
+                                      scales);
+        end = clock();
+        lite_to_5 += (end - begin);
+      }
+      to_1 = 1000 * to_1 / CLOCKS_PER_SEC;
+      to_2 = 1000 * to_2 / CLOCKS_PER_SEC;
+      to_3 = 1000 * to_3 / CLOCKS_PER_SEC;
+      to_4 = 1000 * to_4 / CLOCKS_PER_SEC;
+      to1 = 1000 * to1 / CLOCKS_PER_SEC;
+      std::cout << "opencv convert run time: " << to_1
+                << "ms, avg: " << to_1 / test_iter << std::endl;
+      std::cout << "opencv resize run time: " << to_2
+                << "ms, avg: " << to_2 / test_iter << std::endl;
+      std::cout << "opencv rotate run time: " << to_3
+                << "ms, avg: " << to_3 / test_iter << std::endl;
+      std::cout << "opencv flip  time: " << to_4
+                << "ms, avg: " << to_4 / test_iter << std::endl;
+      std::cout << "opencv total run time: " << to1
+                << "ms, avg: " << to1 / test_iter << std::endl;
+      std::cout << "------" << std::endl;
+
+      lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC;
+      lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC;
+      lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC;
+      lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC;
+      lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC;
+      lite_to = 1000 * lite_to / CLOCKS_PER_SEC;
+      std::cout << "lite convert run time: " << lite_to_1
+                << "ms, avg: " << lite_to_1 / test_iter << std::endl;
+      std::cout << "lite resize run time: " << lite_to_2
+                << "ms, avg: " << lite_to_2 / test_iter << std::endl;
+      std::cout << "lite rotate run time: " << lite_to_3
+                << "ms, avg: " << lite_to_3 / test_iter << std::endl;
+      std::cout << "lite flip  time: " << lite_to_4
+                << "ms, avg: " << lite_to_4 / test_iter << std::endl;
+      std::cout << "lite total run time: " << lite_to
+                << "ms, avg: " << lite_to / test_iter << std::endl;
+      std::cout << "lite img2tensor  time: " << lite_to_5
+                << "ms, avg: " << lite_to_5 / test_iter << std::endl;
+      std::cout << "------" << std::endl;
+
+      // save_img
+      std::cout << "write image: " << std::endl;
+      std::string resize_name = dst_path + "/resize.jpg";
+      std::string convert_name = dst_path + "/convert.jpg";
+      std::string rotate_name = dst_path + "/rotate.jpg";
+      std::string flip_name = dst_path + "/flip.jpg";
+      // The Paddle-Lite buffers are in dstFormat: 1 byte per pixel for GRAY,
+      // 3 otherwise, so the cv::Mat type has to follow it.
+      const int mat_type =
+          (dstFormat == ImageFormat::GRAY) ? CV_8UC1 : CV_8UC3;
+      cv::Mat resize_mat(dsth, dstw, mat_type);
+      cv::Mat convert_mat(srch, srcw, mat_type);
+      cv::Mat rotate_mat;
+      if (rotate == 90 || rotate == 270) {
+        rotate_mat = cv::Mat(srcw, srch, mat_type);
+      } else {
+        rotate_mat = cv::Mat(srch, srcw, mat_type);
+      }
+      cv::Mat flip_mat(srch, srcw, mat_type);
+      fill_with_mat(resize_mat, resize_tmp);
+      fill_with_mat(convert_mat, lite_dst);
+      fill_with_mat(rotate_mat, tv_out_ratote);
+      fill_with_mat(flip_mat, tv_out_flip);
+      cv::imwrite(convert_name, convert_mat);
+      cv::imwrite(resize_name, resize_mat);
+      cv::imwrite(rotate_name, rotate_mat);
+      cv::imwrite(flip_name, flip_mat);
+    }
+  }
+}
+
+// Entry point. Required: image_path dst_path srcFormat dstFormat width height.
+// Optional, in this order: model_dir flip rotate layout.
+int main(int argc, char** argv) {
+  if (argc < 7) {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " image_path dst_path srcFormat dstFormat width height\n";
+    exit(1);
+  }
+  std::string image_path = argv[1];
+  std::string dst_path = argv[2];
+  int srcFormat = atoi(argv[3]);
+  int dstFormat = atoi(argv[4]);
+  int width = atoi(argv[5]);
+  int height = atoi(argv[6]);
+  int flip = -1;
+  float rotate = 90;
+  int layout = 1;
+  std::string model_dir = "mobilenet_v1";
+  if (argc > 7) {
+    model_dir = argv[7];
+  }
+  if (argc > 8) {
+    flip = atoi(argv[8]);
+  }
+  if (argc > 9) {
+    rotate = atoi(argv[9]);
+  }
+  if (argc > 10) {
+    layout = atoi(argv[10]);
+  }
+  test_img({3},
+           {1, 2, 4},
+           image_path,
+           dst_path,
+           (ImageFormat)srcFormat,
+           (ImageFormat)dstFormat,
+           width,
+           height,
+           rotate,
+           (FlipParam)flip,
+           (LayoutType)layout,
+           model_dir,
+           20);
+  return 0;
+}
diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24f408bf4a55ea2d499e39902201597c0e8c6e4e
--- /dev/null
+++ b/lite/demo/cxx/test_cv/test_model_cv.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +// Normalize and re-layout: 3-channel interleaved floats (din) -> three planes of `size` floats each (dout), out = (in - mean[c]) / scale[c]; NEON does 4 pixels per step, a scalar tail does the rest +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { // scalar tail: must match the NEON path (one output plane per channel, divide by scale) + *(dout_c0++) = (*(din++) - mean[0]) / scale[0]; + *(dout_c1++) = (*(din++) - mean[1]) / scale[1]; + *(dout_c2++) = (*(din++) - mean[2]) / scale[2]; + } +} +void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { +#ifdef LITE_WITH_CV + typedef paddle::lite::utils::cv::ImageFormat ImageFormat; + typedef paddle::lite::utils::cv::FlipParam FlipParam; + typedef paddle::lite::utils::cv::TransParam TransParam; + typedef 
paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; + typedef paddle::lite_api::DataLayoutType LayoutType; + // init TransParam + TransParam tp; + tp.iw = img.cols; + tp.ih = img.rows; + tp.ow = width; + tp.oh = height; + ImageFormat srcFormat = ImageFormat::BGR; + ImageFormat dstFormat = ImageFormat::RGB; + // init ImagePreprocess + ImagePreprocess img_process(srcFormat, dstFormat, tp); + // init temp var + const uint8_t* img_ptr = reinterpret_cast(img.data); + uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3]; + uint8_t* resize_ptr = new uint8_t[width * height * 3]; + // do convert bgr--rgb + img_process.imageConvert(img_ptr, rgb_ptr); + // do resize + img_process.imageResize(rgb_ptr, resize_ptr); + // data--tensor and normalize + float means[3] = {103.94f, 116.78f, 123.68f}; + float scales[3] = {0.017f, 0.017f, 0.017f}; + img_process.image2Tensor( + resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales); + float* data = dstTensor.mutable_data(); +#else + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + float* data = dstTensor.mutable_data(); + neon_mean_scale(dimg, data, width * height, means, scales); +#endif +} + +void RunModel(std::string model_dir, + std::string img_path, + std::vector input_shape, + PowerMode power_mode, + int thread_num, + int test_iter, + int warmup = 0) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + // 3. 
Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); + auto* data = input_tensor->mutable_data(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + + pre_process(img, input_shape[3], input_shape[2], *input_tensor); + + // 4. Run predictor + for (int i = 0; i < warmup; ++i) { + predictor->Run(); + } + double lps = 0.f; + double min_time = 1000000.f; + double max_time = 0.f; + for (int i = 0; i < test_iter; ++i) { + clock_t begin = clock(); + predictor->Run(); + clock_t end = clock(); + double t = (end - begin) * 1000; + t = t / CLOCKS_PER_SEC; + lps += t; + if (t < min_time) { + min_time = t; + } + if (t > max_time) { + max_time = t; + } + std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl; + } + std::cout << "================== Speed Report ===================" + << std::endl; + std::cout << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup + << ", repeats: " << test_iter << ", avg time: " << lps / test_iter + << " ms" + << ", min time: " << min_time << " ms" + << ", max time: " << max_time << " ms." << std::endl; + + // 5. 
Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int output_num = 1; + for (int i = 0; i < shape_out.size(); ++i) { + output_num *= shape_out[i]; + } + std::cout << "output_num: " << output_num << std::endl; + for (int i = 0; i < output_num; i += 100) { + std::cout << "i: " << i << ", out: " << outptr[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " model_dir image_path input_shape\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + std::vector input_shape; + input_shape.push_back(atoi(argv[3])); + input_shape.push_back(atoi(argv[4])); + input_shape.push_back(atoi(argv[5])); + input_shape.push_back(atoi(argv[6])); + int power_mode = 3; + int threads = 1; + int test_iter = 100; + int warmup = 10; + if (argc > 7) { + power_mode = atoi(argv[7]); + } + if (argc > 8) { + threads = atoi(argv[8]); + } + if (argc > 9) { + test_iter = atoi(argv[9]); + } + if (argc > 10) { + warmup = atoi(argv[10]); + } + RunModel(model_dir, + img_path, + input_shape, + (PowerMode)power_mode, + threads, + test_iter, + warmup); + return 0; +} diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt index 05fcc06b10ae5dc6b009ae087ce4e18f8d82e475..697c9874ef2072eedf6b654863e25e981fb6834a 100644 --- a/lite/tests/cv/CMakeLists.txt +++ b/lite/tests/cv/CMakeLists.txt @@ -1,3 +1,3 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) endif() diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h index 
728d3167144bc6e03683b77803fb4887967eb524..92f68543bb15bdc15a8ed029f67ed33ca215361b 100644 --- a/lite/tests/cv/cv_basic.h +++ b/lite/tests/cv/cv_basic.h @@ -192,7 +192,6 @@ void nv21_bgra_basic(const uint8_t* in_data, nv2bgra(in_data, out_data, srcw, srch, 0, 1); } -/* /* 采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R 采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B @@ -217,6 +216,21 @@ void bgr_gray_basic(const uint8_t* in_data, } } } +void bgra_gray_basic(const uint8_t* in_data, + uint8_t* out_data, + int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + const uint8_t* din_ptr = in_data + i * 4 * srcw; + uint8_t* dout_ptr = out_data + i * srcw; + for (int j = 0; j < srcw; j++) { + int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38; + sum = sum >> 7; + *dout_ptr++ = sum; + din_ptr += 4; + } + } +} void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -228,6 +242,17 @@ void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +void gray_bgra_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -340,6 +365,16 @@ void image_convert_basic(const uint8_t* in_data, (srcFormat == ImageFormat::GRAY && dstFormat == ImageFormat::BGR)) { gray_bgr_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::GRAY) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::GRAY)) { + bgra_gray_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::RGBA) || + (srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::BGRA)) { + 
gray_bgra_basic(in_data, out_data, srcw, srch); } else if ((srcFormat == ImageFormat::RGBA && dstFormat == ImageFormat::RGB) || (srcFormat == ImageFormat::BGRA && @@ -525,6 +560,7 @@ void image_resize_basic(const uint8_t* in_data, int y_flag = 0; // only one line if (y_in_start < 0) { y_flag = 1; + y_in_end = 0; } float b0 = ibeta[dy * 2]; float b1 = ibeta[dy * 2 + 1]; @@ -750,6 +786,26 @@ void image_flip_basic(const uint8_t* in_data, flipxy_basic(in_data, srch, srcw, out_data, num); } } +void gray_to_tensor_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + for (int h = 0; h < height; h++) { + const uint8_t* ptr_bgr = bgr + h * width * num; + float* ptr_h = output + h * width; + for (int i = 0; i < width; i++) { + *ptr_h++ = (ptr_bgr[0] - mean_val) * scale_val; + ptr_bgr += num; + } + } +} void bgr_to_tensor_chw_basic(const uint8_t* bgr, float* output, @@ -828,5 +884,8 @@ void image_to_tensor_basic(const uint8_t* in_data, } else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA)) { bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4); + } else if (srcFormat == ImageFormat::GRAY && + (layout == LayoutType::kNHWC || layout == LayoutType::kNCHW)) { + gray_to_tensor_basic(in_data, output, srcw, srch, means, scales, 1); } } diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index eefd30f74f570f64d1b5617c9dddc836086394b1..e22e327e8b10d1237f5e07b5b0a8d95d3b19e70b 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -20,6 +20,7 @@ #include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" #include "lite/utils/cv/paddle_image_preprocess.h" +#include "time.h" // NOLINT DEFINE_int32(cluster, 3, "cluster id"); DEFINE_int32(threads, 1, "threads num"); @@ -28,15 
+29,15 @@ DEFINE_int32(repeats, 1, "repeats times"); DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); -DEFINE_int32(srcFormat, 0, "input image format"); -DEFINE_int32(dstFormat, 1, "output image format"); +DEFINE_int32(srcFormat, 0, "input image format RGBA"); +DEFINE_int32(dstFormat, 2, "output image format RGB"); DEFINE_int32(srch, 1920, "input height"); DEFINE_int32(srcw, 1080, "input width"); DEFINE_int32(dsth, 960, "output height"); DEFINE_int32(dstw, 540, "output width"); DEFINE_int32(angle, 90, "rotate angel"); DEFINE_int32(flip_num, 0, "flip x"); -DEFINE_int32(layout, 0, "layout nchw"); +DEFINE_int32(layout, 1, "layout nchw"); typedef paddle::lite::utils::cv::ImageFormat ImageFormat; typedef paddle::lite::utils::cv::FlipParam FlipParam; @@ -99,7 +100,7 @@ void test_img(const std::vector& cluster_id, float rotate, FlipParam flip, LayoutType layout, - int test_iter = 1) { + int test_iter = 10) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -221,7 +222,7 @@ void test_img(const std::vector& cluster_id, float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; if (FLAGS_check_result) { - LOG(INFO) << "image convert basic compute"; + // LOG(INFO) << "image convert basic compute"; image_convert_basic(src, basic_dst, (ImageFormat)srcFormat, @@ -230,7 +231,7 @@ void test_img(const std::vector& cluster_id, srch, out_size); - LOG(INFO) << "image resize basic compute"; + // LOG(INFO) << "image resize basic compute"; image_resize_basic(basic_dst, resize_basic, (ImageFormat)dstFormat, @@ -239,7 +240,7 @@ void test_img(const std::vector& cluster_id, dstw, dsth); - LOG(INFO) << "image rotate basic compute"; + // LOG(INFO) << "image rotate basic compute"; image_rotate_basic(resize_basic, tv_out_ratote_basic, (ImageFormat)dstFormat, @@ -247,7 +248,7 @@ void test_img(const std::vector& cluster_id, dsth, rotate); - LOG(INFO) << "image flip basic compute"; + // LOG(INFO) << "image flip basic compute"; 
image_flip_basic(resize_basic, tv_out_flip_basic, (ImageFormat)dstFormat, @@ -255,7 +256,7 @@ void test_img(const std::vector& cluster_id, dsth, flip); - LOG(INFO) << "image to tensor basic compute"; + // LOG(INFO) << "image to tensor basic compute"; image_to_tensor_basic(resize_basic, &tensor_basic, (ImageFormat)dstFormat, @@ -267,10 +268,13 @@ void test_img(const std::vector& cluster_id, } Timer t1; + Timer t_convert; + Timer t_resize; + Timer t_flip; + Timer t_rotate; + Timer t_tensor; LOG(INFO) << "saber cv compute"; - double to = 0; - double min_time = 100000; TransParam tparam; tparam.ih = srch; tparam.iw = srcw; @@ -285,15 +289,17 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.Reset(); t1.Start(); - LOG(INFO) << "image convert saber compute"; + // LOG(INFO) << "image convert saber compute"; + t_convert.Start(); // 方法一: image_preprocess.imageCovert(src, lite_dst); - image_preprocess.imageCovert( + image_preprocess.imageConvert( src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); + t_convert.Stop(); - LOG(INFO) << "image resize saber compute"; + // LOG(INFO) << "image resize saber compute"; + t_resize.Start(); // 方法一:image_preprocess.imageResize(lite_dst, resize_tmp); image_preprocess.imageResize(lite_dst, resize_tmp, @@ -302,8 +308,10 @@ void test_img(const std::vector& cluster_id, srch, dstw, dsth); + t_resize.Stop(); - LOG(INFO) << "image rotate saber compute"; + // LOG(INFO) << "image rotate saber compute"; + t_rotate.Start(); // 方法一: image_preprocess.imageRotate(resize_tmp, tv_out_ratote); image_preprocess.imageRotate(resize_tmp, tv_out_ratote, @@ -311,13 +319,17 @@ void test_img(const std::vector& cluster_id, dstw, dsth, rotate); + t_rotate.Stop(); - LOG(INFO) << "image flip saber compute"; + // LOG(INFO) << "image flip saber compute"; + t_flip.Start(); // 方法一: image_preprocess.imageFlip(resize_tmp, tv_out_flip); 
image_preprocess.imageFlip( resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip); + t_flip.Stop(); - LOG(INFO) << "image to tensor compute"; + // LOG(INFO) << "image to tensor compute"; + t_tensor.Start(); // 方法一: image_preprocess.image2Tensor( // resize_tmp, &dst_tensor, layout, means, scales); image_preprocess.image2Tensor(resize_tmp, @@ -328,16 +340,27 @@ void test_img(const std::vector& cluster_id, layout, means, scales); - + t_tensor.Stop(); t1.Stop(); - double tdiff = t1.LapTimes().Avg(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } } - LOG(INFO) << "image trans total time : " << to - << ", avg time : " << to / test_iter; + LOG(INFO) << "image convert avg time : " << t_convert.LapTimes().Avg() + << ", min time: " << t_convert.LapTimes().Min() + << ", max time: " << t_convert.LapTimes().Max(); + LOG(INFO) << "image resize avg time : " << t_resize.LapTimes().Avg() + << ", min time: " << t_resize.LapTimes().Min() + << ", max time: " << t_resize.LapTimes().Max(); + LOG(INFO) << "image rotate avg time : " << t_rotate.LapTimes().Avg() + << ", min time: " << t_rotate.LapTimes().Min() + << ", max time: " << t_rotate.LapTimes().Max(); + LOG(INFO) << "image flip avg time : " << t_flip.LapTimes().Avg() + << ", min time: " << t_flip.LapTimes().Min() + << ", max time: " << t_flip.LapTimes().Max(); + LOG(INFO) << "image tensor avg time : " << t_tensor.LapTimes().Avg() + << ", min time: " << t_tensor.LapTimes().Min() + << ", max time: " << t_tensor.LapTimes().Max(); + LOG(INFO) << "image trans total avg time : " << t1.LapTimes().Avg() + << ", min time: " << t1.LapTimes().Min() + << ", max time: " << t1.LapTimes().Max(); double max_ratio = 0; double max_diff = 0; @@ -536,7 +559,7 @@ void test_img(const std::vector& cluster_id, } } -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_convert_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -546,19 +569,16 @@ TEST(TestImageConvertRand, 
test_func_image_convert_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1}) { - if ((dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || + if ((srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + (dstFormat == ImageFormat::GRAY)) { + continue; + } + if ((dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) && + (srcFormat == ImageFormat::GRAY)) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -591,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { } } #endif -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_resize_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -601,21 +621,13 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4, 11}) { for (auto layout : {1}) { if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || + dstFormat == ImageFormat::NV21 || (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + dstFormat == 
ImageFormat::GRAY) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -656,25 +668,10 @@ TEST(TestImageConvertRand, test_func_image_trans_preprocess) { for (auto ww : {32, 112}) { for (auto hh : {112}) { for (auto rotate : {90, 180, 270}) { - for (auto flip : {0, 1, 2}) { - for (auto srcFormat : {11}) { - for (auto dstFormat : {3}) { + for (auto flip : {-1, 0, 1}) { + for (auto srcFormat : {0}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1, 3}) { - if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || - srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { - continue; - } if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { if (w % 2) { // is not ou shu, two line y == one line @@ -717,7 +714,8 @@ TEST(TestImageConvertCustom, test_func_image_preprocess_custom) { (ImageFormat)FLAGS_dstFormat, FLAGS_angle, (FlipParam)FLAGS_flip_num, - (LayoutType)FLAGS_layout); + (LayoutType)FLAGS_layout, + 20); } #endif #endif diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 0edcb2ef24ce4f53ffffa14ad70cbbc1d5c5971e..6c88e70de125b650bcf576fd686373c59e37454c 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,5 +1,4 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - set(lite_cv_deps) lite_cc_library(paddle_cv_arm SRCS image_convert.cc paddle_image_preprocess.cc @@ -7,5 +6,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ image_flip.cc image_rotate.cc image_resize.cc - DEPS ${lite_cv_deps} paddle_api place) + DEPS paddle_api place) endif() diff 
--git a/lite/utils/cv/image2tensor.cc b/lite/utils/cv/image2tensor.cc index b51a82da1d0e9dc1750670ef55690e9a34a659fc..3a09039a0f53c9ac49a472b61b477dd6d2e5ac33 100644 --- a/lite/utils/cv/image2tensor.cc +++ b/lite/utils/cv/image2tensor.cc @@ -18,6 +18,13 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -52,7 +59,7 @@ void bgra_to_tensor_hwc(const uint8_t* src, * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW @@ -79,6 +86,9 @@ void Image2Tensor::choose(const uint8_t* src, } else if (layout == LayoutType::kNHWC && (srcFormat == BGRA || srcFormat == RGBA)) { impl_ = bgra_to_tensor_hwc; + } else if ((layout == LayoutType::kNHWC || layout == LayoutType::kNCHW) && + (srcFormat == GRAY)) { + impl_ = gray_to_tensor; } else { printf("this layout: %d or image format: %d not support \n", static_cast(layout), @@ -87,6 +97,147 @@ void Image2Tensor::choose(const uint8_t* src, } impl_(src, output, srcw, srch, means, scales); } +// gray (1 channel, uint8) -> float tensor: output = (src - means[0]) * scales[0], row by row; asm converts 16 pixels per iteration, a scalar loop handles the remaining width % 16 +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + int dim16 = width >> 4; // number of full 16-pixel groups per row (width / 16) + int remain = width % 16; + + float32x4_t vmean = vdupq_n_f32(mean_val); + float32x4_t vscale = vdupq_n_f32(scale_val); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * width; + float* ptr_h = output + i * width; + int cnt = dim16; + if (cnt > 0) { +#ifdef 
__aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #64] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr0], #192] \n" + "1: \n" + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0 = y0..y7 + "ld1 {v1.8b}, [%[inptr0]], #8 \n" // v1 = y8..y15 + // 8->16 + "ushll v3.8h, v0.8b, #0 \n" + "ushll v4.8h, v1.8b, #0 \n" + // 16->32 + "ushll v6.4s, v3.4h, #0 \n" + "ushll2 v7.4s, v3.8h, #0 \n" + "ushll v8.4s, v4.4h, #0 \n" + "ushll2 v9.4s, v4.8h, #0 \n" + // int32->fp32 + "ucvtf v12.4s, v6.4s \n" + "ucvtf v13.4s, v7.4s \n" + "ucvtf v14.4s, v8.4s \n" + "ucvtf v15.4s, v9.4s \n" + // sub -mean + "fsub v12.4s, v12.4s, %w[vmean].4s \n" + "fsub v13.4s, v13.4s, %w[vmean].4s \n" + "fsub v14.4s, v14.4s, %w[vmean].4s \n" + "fsub v15.4s, v15.4s, %w[vmean].4s \n" + // mul * scale + "fmul v6.4s, v12.4s, %w[vscale].4s \n" + "fmul v7.4s, v13.4s, %w[vscale].4s \n" + "fmul v8.4s, v14.4s, %w[vscale].4s \n" + "fmul v9.4s, v15.4s, %w[vscale].4s \n" + // store + "st1 {v6.4s}, [%[outr0]], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v7.4s}, [%[outr0]], #16 \n" + "st1 {v8.4s}, [%[outr0]], #16 \n" + "st1 {v9.4s}, [%[outr0]], #16 \n" + "bne 1b \n" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #64] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr0], #192] @ preload a, 64byte\n" + "1: \n" + "vld1.8 {d12, d13}, [%[inptr0]]! 
\n" + // 8->16 + "vmovl.u8 q8, d12 \n" + "vmovl.u8 q9, d13 \n" + // 16->32 + "vmovl.u16 q11, d16 \n" + "vmovl.u16 q12, d17 \n" + "vmovl.u16 q13, d18 \n" + "vmovl.u16 q14, d19 \n" + // int32->fp32 + "vcvt.f32.u32 q7, q11 \n" + "vcvt.f32.u32 q8, q12 \n" + "vcvt.f32.u32 q9, q13 \n" + "vcvt.f32.u32 q10, q14 \n" + // sub -mean + "vsub.f32 q7, q7, %q[vmean] \n" + "vsub.f32 q8, q8, %q[vmean] \n" + "vsub.f32 q9, q9, %q[vmean] \n" + "vsub.f32 q10, q10, %q[vmean] \n" + // mul *scale + "vmul.f32 q11, q7, %q[vscale] \n" + "vmul.f32 q12, q8, %q[vscale] \n" + "vmul.f32 q13, q9, %q[vscale] \n" + "vmul.f32 q14, q10, %q[vscale] \n" + // store + "vst1.32 {d22 - d23}, [%[outr0]]! \n" + "subs %[cnt], #1 \n" + "vst1.32 {d24 - d25}, [%[outr0]]! \n" + "vst1.32 {d26 - d27}, [%[outr0]]! \n" + "vst1.32 {d28 - d29}, [%[outr0]]! \n" + "bne 1b" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + for (int j = 0; j < remain; j++) { + *ptr_h++ = (*din_ptr - mean_val) * scale_val; + din_ptr++; + } + } +} + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -390,6 +541,7 @@ void bgra_to_tensor_chw(const uint8_t* src, } } } + void bgr_to_tensor_hwc(const uint8_t* src, float* output, int width, diff --git a/lite/utils/cv/image_convert.cc b/lite/utils/cv/image_convert.cc index 24b6db70dd4f4fb1ad8e8c915444684d4db07cfd..385f56d233cb151445a086ed59d5c40374cd8c36 100644 --- a/lite/utils/cv/image_convert.cc +++ b/lite/utils/cv/image_convert.cc @@ -30,10 +30,14 @@ void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra rgba to gray +void hwc4_to_hwc1(const uint8_t* src, uint8_t* 
dst, int srcw, int srch); // bgr rgb to gray void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // gray to bgr rgb void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// gray to bgra rgba +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr to bgra or rgb to rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgra to bgr or rgba to rgb @@ -112,6 +116,12 @@ void ImageConvert::choose(const uint8_t* src, } else if ((srcFormat == RGB && dstFormat == BGRA) || (srcFormat == BGR && dstFormat == RGBA)) { impl_ = hwc3_trans_hwc4; + } else if ((srcFormat == GRAY && dstFormat == RGBA) || + (srcFormat == GRAY && dstFormat == BGRA)) { + impl_ = hwc1_to_hwc4; + } else if ((srcFormat == RGBA && dstFormat == GRAY) || + (srcFormat == BGRA && dstFormat == GRAY)) { + impl_ = hwc4_to_hwc1; } else { printf("srcFormat: %d, dstFormat: %d does not support! \n", srcFormat, @@ -989,7 +999,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { "vshrn.u32 d24, q6, #7 \n" "vshrn.u32 d25, q7, #7 \n" "vshrn.u32 d26, q8, #7 \n" - "vshrn.u32 d27, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" // 16->8 "vmovn.u16 d4, q10 \n" "vmovn.u16 d5, q11 \n" @@ -1077,6 +1087,280 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } /* +With CV_BGR2GRAY the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R +With CV_RGB2GRAY the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B +b = 0.114 * 128 = 14.592 = 15 +g = 0.587 * 128 = 75.136 = 75 +r = 0.2989 * 128 = 38.2592 = 38 +Gray = (15*B + 75*G + 38*R)/128 +bgra2gray, rgba2gray +*/ +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + uint8_t b = 15; + uint8_t g = 75; + uint8_t r = 38; + + uint8x8_t vb = vdup_n_u8(b); + uint8x8_t vg = vdup_n_u8(g); + uint8x8_t vr = vdup_n_u8(r); +#ifdef __aarch64__ +#else + uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; + uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; + uint8_t vr_array[8] = 
{r, r, r, r, r, r, r, r}; +#endif + int cnt_pro = srcw >> 3; + int remain_pro = srcw % 8; + int win = srcw * 4; + int i = 0; +#pragma omp parallel for + for (i = 0; i < srch - 3; i += 4) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + uint8_t* outr0 = dst + i * srcw; + uint8_t* outr1 = outr0 + srcw; + uint8_t* outr2 = outr1 + srcw; + uint8_t* outr3 = outr2 + srcw; + + int cnt = cnt_pro; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr1], #128] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr2], #128] \n" + "prfm pldl1keep, [%[inptr3]] \n" + "prfm pldl1keep, [%[inptr3], #128] \n" + "1: \n" + "ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v4.8b - v7.8b}, [%[inptr1]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v8.8b - v11.8b}, [%[inptr2]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... 
+ // mul b + "umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb + "umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb + "umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb + "umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb + // mul g + "umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb + "umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb + "umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb + "umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb + // mul r + "umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb + "umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb + "umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb + "umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb + // 16->32 + "uaddl v0.4s, v17.4h, v13.4h \n" + "uaddl2 v1.4s, v17.8h, v13.8h \n" + "uaddl v2.4s, v18.4h, v14.4h \n" + "uaddl2 v3.4s, v18.8h, v14.8h \n" + "uaddl v4.4s, v19.4h, v15.4h \n" + "uaddl2 v5.4s, v19.8h, v15.8h \n" + "uaddl v6.4s, v20.4h, v16.4h \n" + "uaddl2 v7.4s, v20.8h, v16.8h \n" + // 32->16 v0 >> 7 + "shrn v12.4h, v0.4s, #7 \n" + "shrn2 v12.8h, v1.4s, #7 \n" + "shrn v13.4h, v2.4s, #7 \n" + "shrn2 v13.8h, v3.4s, #7 \n" + "shrn v14.4h, v4.4s, #7 \n" + "shrn2 v14.8h, v5.4s, #7 \n" + "shrn v15.4h, v6.4s, #7 \n" + "shrn2 v15.8h, v7.4s, #7 \n" + // 16->8 + "xtn v0.8b, v12.8h \n" + "xtn v1.8b, v13.8h \n" + "xtn v2.8b, v14.8h \n" + "xtn v3.8b, v15.8h \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v0.8b}, [%[outr0]], #8 \n" + "st1 {v1.8b}, [%[outr1]], #8 \n" + "st1 {v2.8b}, [%[outr2]], #8 \n" + "st1 {v3.8b}, [%[outr3]], #8 \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 
64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr1], #128] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr2], #128] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + "pld [%[inptr3], #128] @ preload a, 64byte\n" + "vld1.8 d0, [%[vb]] \n" + "vld1.8 d1, [%[vg]] \n" + "vld1.8 d2, [%[vr]] \n" + "1: \n" + "vld4.8 {d3, d4, d5, d6}, [%[inptr0]]! \n" + "vld4.8 {d7, d8, d9, d10}, [%[inptr1]]! \n" + "vld4.8 {d11, d12, d13, d14}, [%[inptr2]]! \n" + "vld4.8 {d15, d16, d17, d18}, [%[inptr3]]! \n" + // vb + "vmull.u8 q10, d3, d0 \n" + "vmull.u8 q11, d7, d0 \n" + "vmull.u8 q12, d11, d0 \n" + "vmull.u8 q13, d15, d0 \n" + // vg + "vmull.u8 q14, d4, d1 \n" + "vmull.u8 q15, d8, d1 \n" + "vmull.u8 q5, d12, d1 \n" + "vmull.u8 q7, d16, d1 \n" + // vr + "vmlal.u8 q10, d5, d2 \n" + "vmlal.u8 q11, d9, d2 \n" + "vmlal.u8 q12, d13, d2 \n" + "vmlal.u8 q13, d17, d2 \n" + // 16->32 + "vaddl.u16 q2, d28, d20 \n" + "vaddl.u16 q3, d29, d21 \n" + "vaddl.u16 q4, d30, d22 \n" + "vaddl.u16 q10, d31, d23 \n" + "vaddl.u16 q6, d10, d24 \n" + "vaddl.u16 q11, d11, d25 \n" + "vaddl.u16 q8, d14, d26 \n" + "vaddl.u16 q9, d15, d27 \n" + // 32->16 q2 >> 7 + "vshrn.u32 d10, q2, #7 \n" + "vshrn.u32 d11, q3, #7 \n" + "vshrn.u32 d14, q4, #7 \n" + "vshrn.u32 d15, q10, #7 \n" + "vshrn.u32 d24, q6, #7 \n" + "vshrn.u32 d25, q11, #7 \n" + "vshrn.u32 d26, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" + // 16->8 + "vmovn.u16 d4, q5 \n" + "vmovn.u16 d5, q7 \n" + "vmovn.u16 d6, q12 \n" + "vmovn.u16 d7, q13 \n" + "subs %[cnt], #1 \n" + // store + "vst1.8 d4, [%[outr0]]! \n" + "vst1.8 d5, [%[outr1]]! \n" + "vst1.8 d6, [%[outr2]]! \n" + "vst1.8 d7, [%[outr3]]! 
\n"
+          "bne 1b \n"
+          : [inptr0] "+r"(inptr0),
+            [inptr1] "+r"(inptr1),
+            [inptr2] "+r"(inptr2),
+            [inptr3] "+r"(inptr3),
+            [outr0] "+r"(outr0),
+            [outr1] "+r"(outr1),
+            [outr2] "+r"(outr2),
+            [outr3] "+r"(outr3),
+            [cnt] "+r"(cnt)
+          : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array)
+          : "cc",
+            "memory",
+            "q0",
+            "q1",
+            "q2",
+            "q3",
+            "q4",
+            "q5",
+            "q6",
+            "q7",
+            "q8",
+            "q9",
+            "q10",
+            "q11",
+            "q12",
+            "q13",
+            "q14",
+            "q15");
+#endif
+    }
+    for (; j < remain_pro; j++) {  // scalar leftovers (srcw % 8) of the 4 rows
+      *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
+      *outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7;
+      *outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7;
+      *outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7;
+      inptr0 += 4;
+      inptr1 += 4;
+      inptr2 += 4;
+      inptr3 += 4;
+    }
+  }
+  for (i = srch - srch % 4; i < srch; i++) {  // explicit start: omp loop var is private
+    int j = 0;
+    const uint8_t* inptr0 = src + i * win;
+    uint8_t* outr0 = dst + i * srcw;
+    for (j = 0; j < cnt_pro; j++) {
+      uint8x8x4_t y0 = vld4_u8(inptr0);  // d8 = y0y3y6y9.. 
d9 = y1y4y7...y
+      uint16x8_t val0 = vmull_u8(y0.val[0], vb);
+
+      uint16x8_t val0_1 = vmull_u8(y0.val[1], vg);
+
+      val0 = vmlal_u8(val0, y0.val[2], vr);
+
+      uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0));
+      uint32x4_t v0_sum1 =
+          vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0));
+
+      uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7);
+      uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7);
+
+      uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16);
+
+      uint8x8_t vout0 = vmovn_u16(v0_sum);
+
+      inptr0 += 32;
+      vst1_u8(outr0, vout0);
+      outr0 += 8;
+    }
+    for (j = 0; j < remain_pro; j++) {  // only srcw % 8 pixels remain after the vector loop
+      *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
+      inptr0 += 4;
+    }
+  }
+}
+/*
 采用CV_GRAY2BGR,转换公式B = G = R = Gray
 采用CV_GRAY2RGB,转换公式R = G = B = Gray
 gray2bgr, gray2rgb
@@ -1091,6 +1375,22 @@ void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
     }
   }
 }
+/*
+CV_GRAY2BGRA: B = G = R = Gray, A = 255
+CV_GRAY2RGBA: R = G = B = Gray, A = 255
+gray2bgra, gray2rgba
+*/
+void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
+  for (int i = 0; i < srch; i++) {
+    for (int j = 0; j < srcw; j++) {
+      *dst++ = *src;
+      *dst++ = *src;
+      *dst++ = *src;
+      *dst++ = 255;
+      src++;
+    }
+  }
+}
 // bgr2bgra, rgb2rgba
 void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
   for (int i = 0; i < srch; i++) {
diff --git a/lite/utils/cv/image_flip.cc b/lite/utils/cv/image_flip.cc
index fd84691a2d1d244350f40238bc137d5d159ba62b..f535c858e4dddcd04a0ce8cfa7a727356df34d64 100644
--- a/lite/utils/cv/image_flip.cc
+++ b/lite/utils/cv/image_flip.cc
@@ -19,6 +19,23 @@ namespace paddle {
 namespace lite {
 namespace utils {
 namespace cv {
+void ImageFlip::choose(const uint8_t* src,
+                       uint8_t* dst,
+                       ImageFormat srcFormat,
+                       int srcw,
+                       int srch,
+                       FlipParam flip_param) {
+  if (srcFormat == GRAY) {
+    flip_hwc1(src, dst, srcw, srch, flip_param);
+  } else if (srcFormat == BGR || srcFormat == RGB) {
+    flip_hwc3(src, dst, srcw, srch, 
flip_param);
+  } else if (srcFormat == BGRA || srcFormat == RGBA) {
+    flip_hwc4(src, dst, srcw, srch, flip_param);
+  } else {
+    printf("this srcFormat: %d does not support! \n", srcFormat);
+    return;
+  }
+}
 // gray
 void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
 void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
@@ -43,6 +60,9 @@ void flip_hwc1(const uint8_t* src,
     flip_hwc1_y(src, dst, srcw, srch);
   } else if (flip_param == XY) {
     flip_hwc1_xy(src, dst, srcw, srch);
+  } else {
+    printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
+    return;
   }
 }
 
@@ -57,6 +77,9 @@ void flip_hwc3(const uint8_t* src,
     flip_hwc3_y(src, dst, srcw, srch);
   } else if (flip_param == XY) {
     flip_hwc3_xy(src, dst, srcw, srch);
+  } else {
+    printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
+    return;
   }
 }
 
@@ -71,6 +94,9 @@ void flip_hwc4(const uint8_t* src,
     flip_hwc4_y(src, dst, srcw, srch);
   } else if (flip_param == XY) {
     flip_hwc4_xy(src, dst, srcw, srch);
+  } else {
+    printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
+    return;
   }
 }
 /*
diff --git a/lite/utils/cv/image_flip.h b/lite/utils/cv/image_flip.h
index 5e513324a179423ec1d008d6e6cd33d29a79c095..7215b9494a36d50cba787be7e53253d704bde8bd 100644
--- a/lite/utils/cv/image_flip.h
+++ b/lite/utils/cv/image_flip.h
@@ -21,6 +21,15 @@ namespace paddle {
 namespace lite {
 namespace utils {
 namespace cv {
+class ImageFlip {
+ public:
+  void choose(const uint8_t* src,
+              uint8_t* dst,
+              ImageFormat srcFormat,
+              int srcw,
+              int srch,
+              FlipParam flip_param);
+};
 void flip_hwc1(
     const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param);
 void flip_hwc3(
diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc
index 8b0b8aa17d3ced769c7ff606e9ba5fe78208b3d7..cd02a2cf4bd0bdfa0f2c45ed2cf0b1ead803480c 100644
--- a/lite/utils/cv/image_resize.cc
+++ b/lite/utils/cv/image_resize.cc
@@ -38,6 +38,15 @@ namespace paddle {
 namespace lite {
namespace utils {
 namespace cv {
+void ImageResize::choose(const uint8_t* src,
+                         uint8_t* dst,
+                         ImageFormat srcFormat,
+                         int srcw,
+                         int srch,
+                         int dstw,
+                         int dsth) {
+  resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
+}
 void compute_xy(int srcw,
                 int srch,
                 int dstw,
diff --git a/lite/utils/cv/image_resize.h b/lite/utils/cv/image_resize.h
index e2e399f542c3b00eaf6a3b09f6315b38518f409f..f11f7b5d93159509ca9069f409335e6530060383 100644
--- a/lite/utils/cv/image_resize.h
+++ b/lite/utils/cv/image_resize.h
@@ -39,6 +39,16 @@ namespace paddle {
 namespace lite {
 namespace utils {
 namespace cv {
+class ImageResize {
+ public:
+  void choose(const uint8_t* src,
+              uint8_t* dst,
+              ImageFormat srcFormat,
+              int srcw,
+              int srch,
+              int dstw,
+              int dsth);
+};
 void resize(const uint8_t* src,
             uint8_t* dst,
             ImageFormat srcFormat,
diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc
index 04ba84076685f89c376203d69ea631afe03671ec..98e61fb444aad691d28ae2116dbbd5743e20e481 100644
--- a/lite/utils/cv/image_rotate.cc
+++ b/lite/utils/cv/image_rotate.cc
@@ -19,6 +19,27 @@ namespace paddle {
 namespace lite {
 namespace utils {
 namespace cv {
+void ImageRotate::choose(const uint8_t* src,
+                         uint8_t* dst,
+                         ImageFormat srcFormat,
+                         int srcw,
+                         int srch,
+                         float degree) {
+  if (degree != 90 && degree != 180 && degree != 270) {
+    printf("this degree: %f not support \n", degree);
+    return;  // nothing to dispatch for an unsupported angle
+  }
+  if (srcFormat == GRAY) {
+    rotate_hwc1(src, dst, srcw, srch, degree);
+  } else if (srcFormat == BGR || srcFormat == RGB) {
+    rotate_hwc3(src, dst, srcw, srch, degree);
+  } else if (srcFormat == BGRA || srcFormat == RGBA) {
+    rotate_hwc4(src, dst, srcw, srch, degree);
+  } else {
+    printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void rotate_hwc1_90( const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); @@ -50,6 +70,9 @@ void rotate_hwc1( rotate_hwc1_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc1_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -61,6 +84,9 @@ void rotate_hwc3( rotate_hwc3_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc3_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -72,6 +98,9 @@ void rotate_hwc4( rotate_hwc4_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc4_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } #ifdef __aarch64__ @@ -578,6 +607,7 @@ void rotate_hwc1_90(const uint8_t* src, int stride_h = 4 * w_in; int stride_h_w = 4 * w_in - 8; int stride_out = 4 * w_out; + int ww = w_out - 8; #pragma omp parallel for for (i = 0; i < h_in - 7; i += 8) { const uint8_t* inptr0 = src + i * w_in; @@ -586,7 +616,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; int j = 0; for (; j < w_in - 7; j += 8) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + (ww - i); uint8_t* outptr1 = outptr0 + w_out; uint8_t* outptr2 = outptr1 + w_out; uint8_t* outptr3 = outptr2 + w_out; @@ -648,7 +678,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr6 = inptr5 + w_in; const uint8_t* inptr7 = inptr6 + w_in; for (; j < w_in; j++) { - uint8_t* outptr = dst + j * w_out + i; + uint8_t* outptr = dst + j * w_out + ww - i; *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; @@ -659,10 +689,11 @@ void rotate_hwc1_90(const uint8_t* src, *outptr++ = *inptr7++; } } + ww = w_out - 1; for (; i < h_in; i++) { const uint8_t* inptr0 = src + i * 
w_in; for (int j = 0; j < w_in; j++) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + ww - i; *outptr0 = *inptr0++; } } @@ -693,9 +724,9 @@ void rotate_hwc1_180(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last - uint8_t* outptr1 = outptr0 + w_out; - uint8_t* outptr2 = outptr1 + w_out; - uint8_t* outptr3 = outptr2 + w_out; + uint8_t* outptr1 = outptr0 - w_out; + uint8_t* outptr2 = outptr1 - w_out; + uint8_t* outptr3 = outptr2 - w_out; if (i + 3 >= h_in) { uint8_t* ptr = zerobuff + w_in - stride_w; diff --git a/lite/utils/cv/image_rotate.h b/lite/utils/cv/image_rotate.h index 8335fca28051c3ba0ae5070464c32d5e804361f4..8e04a3f5244ab5740f9ee1b0e3586cdcea7aa32a 100644 --- a/lite/utils/cv/image_rotate.h +++ b/lite/utils/cv/image_rotate.h @@ -16,10 +16,20 @@ #include #include +#include "lite/utils/cv/paddle_image_preprocess.h" namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageRotate { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree); +}; void rotate_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); void rotate_hwc3( diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index f18047556874a82d28c5964a1b5fd2fa8284c814..c46811a046a19a50592097fb987280ad19608193 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -25,7 +25,6 @@ namespace paddle { namespace lite { namespace utils { namespace cv { - #define PI 3.14159265f #define Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180)) #define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI)) @@ -38,7 +37,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, this->dstFormat_ = dstFormat; this->transParam_ = param; } -void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) 
{ +void ImagePreprocess::imageConvert(const uint8_t* src, uint8_t* dst) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -48,10 +47,10 @@ void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { this->transParam_.ih); } -void ImagePreprocess::imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat) { +void ImagePreprocess::imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -68,7 +67,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, int srch, int dstw, int dsth) { - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { @@ -77,7 +77,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { int dstw = this->transParam_.ow; int dsth = this->transParam_.oh; auto srcFormat = this->dstFormat_; - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageRotate(const uint8_t* src, @@ -86,19 +87,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, int srcw, int srch, float degree) { - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - if (srcFormat == GRAY) { - rotate_hwc1(src, dst, srcw, srch, degree); - } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - rotate_hwc4(src, dst, srcw, srch, degree); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { @@ -106,10 +96,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto degree = this->transParam_.rotate_param; - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree); + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageFlip(const uint8_t* src, @@ -118,16 +106,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, int srcw, int srch, FlipParam flip_param) { - if (srcFormat == GRAY) { - flip_hwc1(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGR || srcFormat == RGB) { - flip_hwc3(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - flip_hwc4(src, dst, srcw, srch, flip_param); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { @@ -135,7 +115,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto flip_param = this->transParam_.flip_param; - ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param); + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::image2Tensor(const uint8_t* src, diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 5a46a9e48e8202fe29ec9fc7d950ccf15920cc32..a12c0d11f067fc3e807682f9a213d3024def97e0 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -19,6 +19,7 @@ #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" + namespace paddle { namespace lite { namespace utils { @@ -37,9 +38,9 @@ enum ImageFormat { }; // flip enum enum FlipParam { - X = 0, // flip along the X axis - Y, // flip along the Y axis - XY // flip along the XY axis + XY = -1, // flip along the XY axis + X = 0, // flip along the X axis + Y // flip along the Y axis }; // transform param typedef struct { @@ -69,11 +70,12 @@ class ImagePreprocess { * BGR(RGB)and BGRA(RGBA) transform, * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, - * BGR(RGB)and GRAY transform, + * BGR(RGB) and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data */ - void imageCovert(const uint8_t* src, uint8_t* dst); + void imageConvert(const uint8_t* src, uint8_t* dst); /* * image color convert * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), @@ -81,6 +83,7 @@ class ImagePreprocess { * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, * BGR(RGB)and GRAY transform, + * 
BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data * param srcFormat: input image image format support: GRAY, NV12(NV21), @@ -88,10 +91,10 @@ class ImagePreprocess { * param dstFormat: output image image format, support GRAY, BGR(RGB) and * BGRA(RGBA) */ - void imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat); + void imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat); /* * image resize, use bilinear method * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: @@ -171,7 +174,8 @@ class ImagePreprocess { FlipParam flip_param); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data @@ -186,7 +190,8 @@ class ImagePreprocess { float* scales); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data