Unverified · Commit e84406a7 · Authored by: H HappyAngel · Committed by: GitHub

[arm] add test_cv demo (#2691)

* add cv image process

* fix arm linux build error

* add LITE_WITH_CV define to make cv, test=develop

* fix cv format, and add description in utils/cv

* set LITE_WITH_CV=OFF in build.sh, test=develop

* delete cv_enum.h in utils/cv, push the contents of cv_enum.h into paddle_image_preprocess.h, test=develop

* according to reviews to redefine paddle_image_preprocess.h, test=develop

* add detailed note of flipParam, test=develop

* fix format in paddle_image_preprocess.h, test=develop

* fix cmake error in lite/CMakeLists.txt, missing mkdir cxx, test=develop

* according to review change, test=develop

* add elementwise mul constant elimination and deconv+relu, deconv+batchnorm fusion, test=develop

* fix format, test=develop

* fix model_optimize bug, update concat and split op, speed up, test=develop

* update split speed, test=develop

* fix format, test=develop

* add classify demo in demo/cxx/, test=develop

* fix format in mobile_classify, test=develop

* delete some note and extra code, test=develop

* remove test.jpg and labels.txt, test=develop

* add test_cv in cxx/demo

* add test_cv README, test=develop

* add note info: flip only supports x, y, xy; rotate only supports 90, 180, 270; test=develop

* fix build error, paddle_cv_arm , test=develop

* add GRAY to RGBA(BGRA) convert and RGBA(BGRA)_to_Tensor, test=develop

* fix format from review, test=develop

* fix makefile format, test=develop

* fix build v7 error, test=develop
Parent 6135fd4a
......@@ -226,6 +226,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -243,6 +245,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
......@@ -35,6 +35,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......@@ -45,8 +46,8 @@ else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......@@ -123,6 +124,7 @@ if(WITH_TESTING)
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -285,6 +287,7 @@ endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
${ops}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -307,9 +310,11 @@ if(NOT IOS)
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......
......@@ -86,6 +86,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
......
......@@ -68,26 +68,44 @@ adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb push mobile_classify /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push labels.txt /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobile_classify
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
adb -s emulator-5554 push mobile_classify /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the prediction probabilities of the top-5 classes are printed to the console.
- To see the top-10 class probabilities instead, append the topk value to the run command,
e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To get classification results for another model, pass its model_dir and the model's input size in the run command,
e.g.:
```shell
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
9. Build the model test demo that uses the CV preprocessing library
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb -s emulator-5554 push test_model_cv /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg 1 3 224 224"
```
On success, part of the prediction results is printed to the console.
ARM_ABI = arm7
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: the default build uses Lite's shared library. #
###############################################################
# 1. Comment out the line above that links `libpaddle_full_api_shared.so`
# 2. Uncomment the line below to link `libpaddle_api_light_bundled.a` instead
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
ARM_ABI = arm8
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of the static libraries: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: the default build uses Lite's shared library. #
###############################################################
# 1. Comment out the line above that links `libpaddle_full_api_shared.so`
# 2. Uncomment the line below to link `libpaddle_api_light_bundled.a` instead
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
......@@ -117,7 +117,7 @@ void pre_process(const cv::Mat& img,
float* means,
float* scales) {
cv::Mat rgb_img;
// cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
......
# Using the image preprocessing library
1. Download the source code (https://github.com/PaddlePaddle/Paddle-Lite), enable LITE_WITH_CV=ON, and build in full_publish mode
example:
```shell
# set BUILD_WITH_CV=ON or LITE_WITH_CV=ON in lite/tools/build.sh before running it
./lite/tools/build.sh
--arm_os=android
--arm_abi=armv8
--arm_lang=gcc
--android_stl=c++_static
full_publish
```
2. Prepare the model and optimize it
example:
```shell
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./lite/tools/build.sh build_optimize_tool
./build.model_optimize_tool/lite/api/model_optimize_tool
--optimize_out_type=naive_buffer
--optimize_out=model_dir
--model_dir=model_dir
--prefer_int8_kernel=false
```
3. Build and run the complete test_model_cv demo
example:
```shell
cd inference_lite_lib.android.armv8/demo/cxx/test_cv
```
- Edit the Makefile and comment out the rules that build test_img_prepross (or simply run `make test_model_cv` to build only that target):
```shell
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
#test_img_prepross: fetch_opencv test_img_prepross.o
# $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
#test_img_prepross.o: test_img_prepross.cc
# $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
#rm -f test_img_prepross.o
#rm -f test_img_prepross
```
- Edit ../../../cxx/include/paddle_image_preprocess.h and change the include path of the paddle_api.h header
```shell
origin:
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
now:
#include "paddle_api.h"
#include "paddle_place.h"
```
- The model under test must be a model optimized by model_optimize_tool
```shell
make
adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_model_cv /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_model_cv
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 "
```
On success, part of the prediction results is printed to the console.
4. Build and run the complete test_img_prepross demo
example:
```shell
cd inference_lite_lib.android.armv8/demo/cxx/test_cv
```
- Edit the Makefile and comment out the rules that build test_model_cv (or simply run `make test_img_prepross` to build only that target):
```shell
#test_model_cv: fetch_opencv test_model_cv.o
# $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
#test_model_cv.o: test_model_cv.cc
# $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
.PHONY: clean
clean:
#rm -f test_model_cv.o
#rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
```
- Edit ../../../cxx/include/paddle_image_preprocess.h and change the include path of the paddle_api.h header
```shell
origin:
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
now:
#include "paddle_api.h"
#include "paddle_place.h"
```
- The model under test must be a model optimized by model_optimize_tool
```shell
make
adb -s device_id push mobilenet_v1 /data/local/tmp/
adb -s device_id push test_img_prepross /data/local/tmp/
adb -s device_id push test.jpg /data/local/tmp/
adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s device_id shell chmod +x /data/local/tmp/test_img_prepross
adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_img_prepross /data/local/tmp/test.jpg /data/local/tmp/ 3 3 224 224 /data/local/tmp/mobilenet_v1 "
adb -s device_id pull /data/local/tmp/resize.jpg ./
adb -s device_id pull /data/local/tmp/convert.jpg ./
adb -s device_id pull /data/local/tmp/flip.jpg ./
adb -s device_id pull /data/local/tmp/rotate.jpg ./
```
On success, the OpenCV and Paddle-Lite timings are printed to the console, and the generated preprocessing result images (e.g. resize.jpg, convert.jpg) appear in the test_cv directory.
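
For reference, the preprocessing API used by these demos can also be called directly from C++. The sketch below is pieced together from the calls in test_model_cv.cc; it assumes the library was built with `LITE_WITH_CV=ON` and that the header paths were adjusted as described above. The BGR→RGB conversion, the 224x224 output size and the mean/scale values are example choices taken from that demo, and `preprocess_to_tensor` is a hypothetical helper, not an API of the library:
```cpp
#include <cstdint>
#include "paddle_api.h"               // NOLINT
#include "paddle_image_preprocess.h"  // NOLINT

typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;

void preprocess_to_tensor(const uint8_t* bgr,  // packed HWC BGR input image
                          int w, int h,        // input width/height
                          paddle::lite_api::Tensor* tensor) {
  TransParam tp;
  tp.iw = w;
  tp.ih = h;
  tp.ow = 224;  // output size after resize
  tp.oh = 224;
  ImagePreprocess proc(ImageFormat::BGR, ImageFormat::RGB, tp);
  tensor->Resize({1, 3, 224, 224});
  uint8_t* rgb = new uint8_t[w * h * 3];
  uint8_t* resized = new uint8_t[224 * 224 * 3];
  proc.imageConvert(bgr, rgb);     // BGR -> RGB
  proc.imageResize(rgb, resized);  // w x h -> 224 x 224
  float means[3] = {103.94f, 116.78f, 123.68f};
  float scales[3] = {0.017f, 0.017f, 0.017f};
  // normalize to (pixel - mean) * scale and write NCHW float data into the tensor
  proc.image2Tensor(resized, tensor,
                    paddle::lite_api::DataLayoutType::kNCHW, means, scales);
  delete[] rgb;
  delete[] resized;
}
```
The tensor argument is the predictor input obtained from `predictor->GetInput(0)`, as in `RunModel` in test_model_cv.cc.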
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
#include "paddle_image_preprocess.h" // NOLINT
#include "time.h" // NOLINT
typedef paddle::lite_api::Tensor Tensor;
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
typedef paddle::lite_api::DataLayoutType LayoutType;
using namespace paddle::lite_api; // NOLINT
void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT
for (int i = 0; i < mat.rows; i++) {
for (int j = 0; j < mat.cols; j++) {
int tmp = (i * mat.cols + j) * 3;
cv::Vec3b& rgb = mat.at<cv::Vec3b>(i, j);
rgb[0] = src[tmp];
rgb[1] = src[tmp + 1];
rgb[2] = src[tmp + 2];
}
}
}
void test_img(std::vector<int> cluster_id,
std::vector<int> thread_num,
std::string img_path,
std::string dst_path,
ImageFormat srcFormat,
ImageFormat dstFormat,
int width,
int height,
float rotate,
FlipParam flip,
LayoutType layout,
std::string model_dir,
int test_iter = 1) {
// init
// paddle::lite::DeviceInfo::Init();
// read img and pre-process
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
int srch = img.rows;
int srcw = img.cols;
for (auto& cls : cluster_id) {
for (auto& th : thread_num) {
std::cout << "cluster: " << cls << ", threads: " << th << std::endl;
// 1. Set MobileConfig
MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode((PowerMode)cls);
config.set_threads(th);
std::cout << "model: " << model_dir;
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data from image
std::unique_ptr<Tensor> input_tensor(predictor->GetInput(0));
/*
imread(img_path, param)
IMREAD_UNCHANGED (<0): load the image as-is, without any conversion
IMREAD_GRAYSCALE ( 0): load the image as a single-channel grayscale image
IMREAD_COLOR     (>0): load the image as a 3-channel BGR color image
*/
cv::Mat img;
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
img = imread(img_path, cv::IMREAD_COLOR);
} else if (srcFormat == ImageFormat::GRAY) {
img = imread(img_path, cv::IMREAD_GRAYSCALE);
} else {
printf("this format %d does not support \n", srcFormat);
return;
}
if (img.empty()) {
std::cout << "opencv read image " << img_path.c_str() << " failed"
<< std::endl;
return;
}
int srch = img.rows;
int srcw = img.cols;
int dsth = height;
int dstw = width;
std::cout << " input tensor size, num= " << 1 << ", channel= " << 1
<< ", height= " << srch << ", width= " << srcw
<< ", srcFormat= " << (ImageFormat)srcFormat << std::endl;
// RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12,
if (srcFormat == ImageFormat::GRAY) {
std::cout << "srcFormat: GRAY" << std::endl;
}
if (srcFormat == ImageFormat::BGR) {
std::cout << "srcFormat: BGR" << std::endl;
}
if (srcFormat == ImageFormat::RGB) {
std::cout << "srcFormat: RGB" << std::endl;
}
std::cout << " output tensor size, num=" << 1 << ", channel=" << 1
<< ", height=" << dsth << ", width=" << dstw
<< ", dstFormat= " << (ImageFormat)dstFormat << std::endl;
if (dstFormat == ImageFormat::GRAY) {
std::cout << "dstFormat: GRAY" << std::endl;
}
if (dstFormat == ImageFormat::BGR) {
std::cout << "dstFormat: BGR" << std::endl;
}
if (dstFormat == ImageFormat::RGB) {
std::cout << "dstFormat: RGB" << std::endl;
}
std::cout << "Rotate = " << rotate << ", Flip = " << flip
<< ", Layout = " << static_cast<int>(layout) << std::endl;
if (static_cast<int>(layout) != 1 && static_cast<int>(layout) != 3) {
std::cout << "this layout" << static_cast<int>(layout)
<< " is no support" << std::endl;
}
int size = 3 * srch * srcw;
if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) {
size = 3 * srch * srcw;
} else if (srcFormat == ImageFormat::GRAY) {
size = srch * srcw;
}
uint8_t* src = img.data;
int out_size = srch * srcw;
int resize = dstw * dsth;
if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) {
out_size = 3 * srch * srcw;
resize = 3 * dsth * dstw;
} else if (dstFormat == ImageFormat::GRAY) {
out_size = srch * srcw;
resize = dsth * dstw;
}
// out
uint8_t* lite_dst = new uint8_t[out_size];
uint8_t* resize_tmp = new uint8_t[resize];
uint8_t* tv_out_ratote = new uint8_t[out_size];
uint8_t* tv_out_flip = new uint8_t[out_size];
std::vector<int64_t> shape_out = {1, 3, srch, srcw};
input_tensor->Resize(shape_out);
Tensor dst_tensor = *input_tensor;
std::cout << "opencv compute" << std::endl;
cv::Mat im_convert;
cv::Mat im_resize;
cv::Mat im_rotate;
cv::Mat im_flip;
double to_1 = 0;
double to_2 = 0;
double to_3 = 0;
double to_4 = 0;
double to1 = 0;
for (int i = 0; i < test_iter; i++) {
clock_t start = clock();
clock_t begin = clock();
// convert bgr-gray
if (dstFormat == srcFormat) {
im_convert = img;
} else if (dstFormat == ImageFormat::BGR &&
srcFormat == ImageFormat::GRAY) {
cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR);
} else if (srcFormat == ImageFormat::BGR &&
dstFormat == ImageFormat::GRAY) {
cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY);
} else {
printf("convert format error \n");
return;
}
clock_t end = clock();
to_1 += (end - begin);
begin = clock();
// resize default linear
cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f);
end = clock();
to_2 += (end - begin);
begin = clock();
// rotate 90
if (rotate == 90) {
cv::flip(im_convert.t(), im_rotate, 1);
} else if (rotate == 180) {
cv::flip(im_convert, im_rotate, -1);
} else if (rotate == 270) {
cv::flip(im_convert.t(), im_rotate, 0);
}
end = clock();
to_3 += (end - begin);
begin = clock();
// flip
cv::flip(im_convert, im_flip, flip);
end = clock();
to_4 += (end - begin);
clock_t ovet = clock();
to1 += (ovet - start);
}
std::cout << "Paddle-lite compute" << std::endl;
double lite_to = 0;
double lite_to_1 = 0;
double lite_to_2 = 0;
double lite_to_3 = 0;
double lite_to_4 = 0;
double lite_to_5 = 0;
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
tparam.oh = dsth;
tparam.ow = dstw;
tparam.flip_param = flip;
tparam.rotate_param = rotate;
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
clock_t start = clock();
clock_t begin = clock();
image_preprocess.imageConvert(src, lite_dst);
clock_t end = clock();
lite_to_1 += (end - begin);
begin = clock();
image_preprocess.imageResize(lite_dst, resize_tmp);
end = clock();
lite_to_2 += (end - begin);
begin = clock();
image_preprocess.imageRotate(
lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90);
end = clock();
lite_to_3 += (end - begin);
begin = clock();
image_preprocess.imageFlip(
lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip);
end = clock();
lite_to_4 += (end - begin);
clock_t over = clock();
lite_to += (over - start);
begin = clock();
image_preprocess.image2Tensor(lite_dst,
&dst_tensor,
(ImageFormat)dstFormat,
srcw,
srch,
layout,
means,
scales);
end = clock();
lite_to_5 += (end - begin);
}
to_1 = 1000 * to_1 / CLOCKS_PER_SEC;
to_2 = 1000 * to_2 / CLOCKS_PER_SEC;
to_3 = 1000 * to_3 / CLOCKS_PER_SEC;
to_4 = 1000 * to_4 / CLOCKS_PER_SEC;
to1 = 1000 * to1 / CLOCKS_PER_SEC;
std::cout << "opencv convert run time: " << to_1
<< "ms, avg: " << to_1 / test_iter << std::endl;
std::cout << "opencv resize run time: " << to_2
<< "ms, avg: " << to_2 / test_iter << std::endl;
std::cout << "opencv rotate run time: " << to_3
<< "ms, avg: " << to_3 / test_iter << std::endl;
std::cout << "opencv flip time: " << to_4
<< "ms, avg: " << to_4 / test_iter << std::endl;
std::cout << "opencv total run time: " << to1
<< "ms, avg: " << to1 / test_iter << std::endl;
std::cout << "------" << std::endl;
lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC;
lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC;
lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC;
lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC;
lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC;
lite_to = 1000 * lite_to / CLOCKS_PER_SEC;
std::cout << "lite convert run time: " << lite_to_1
<< "ms, avg: " << lite_to_1 / test_iter << std::endl;
std::cout << "lite resize run time: " << lite_to_2
<< "ms, avg: " << lite_to_2 / test_iter << std::endl;
std::cout << "lite rotate run time: " << lite_to_3
<< "ms, avg: " << lite_to_3 / test_iter << std::endl;
std::cout << "lite flip time: " << lite_to_4
<< "ms, avg: " << lite_to_4 / test_iter << std::endl;
std::cout << "lite total run time: " << lite_to
<< "ms, avg: " << lite_to / test_iter << std::endl;
std::cout << "lite img2tensor time: " << lite_to_5
<< "ms, avg: " << lite_to_5 / test_iter << std::endl;
std::cout << "------" << std::endl;
double max_ratio = 0;
double max_diff = 0;
const double eps = 1e-6f;
// save_img
std::cout << "write image: " << std::endl;
std::string resize_name = dst_path + "/resize.jpg";
std::string convert_name = dst_path + "/convert.jpg";
std::string rotate_name = dst_path + "/rotate.jpg";
std::string flip_name = dst_path + "/flip.jpg";
cv::Mat resize_mat(dsth, dstw, CV_8UC3);
cv::Mat convert_mat(srch, srcw, CV_8UC3);
cv::Mat rotate_mat;
if (rotate == 90 || rotate == 270) {
rotate_mat = cv::Mat(srcw, srch, CV_8UC3);
} else {
rotate_mat = cv::Mat(srch, srcw, CV_8UC3);
}
cv::Mat flip_mat(srch, srcw, CV_8UC3);
fill_with_mat(resize_mat, resize_tmp);
fill_with_mat(convert_mat, lite_dst);
fill_with_mat(rotate_mat, tv_out_ratote);
fill_with_mat(flip_mat, tv_out_flip);
cv::imwrite(convert_name, convert_mat);
cv::imwrite(resize_name, resize_mat);
cv::imwrite(rotate_name, rotate_mat);
cv::imwrite(flip_name, flip_mat);
delete[] lite_dst;
delete[] resize_tmp;
delete[] tv_out_ratote;
delete[] tv_out_flip;
}
}
}
int main(int argc, char** argv) {
if (argc < 7) {
std::cerr << "[ERROR] usage: " << argv[0]
<< " image_path dst_apth srcFormat dstFormat width height\n";
exit(1);
}
std::string image_path = argv[1];
std::string dst_path = argv[2];
int srcFormat = atoi(argv[3]);
int dstFormat = atoi(argv[4]);
int width = atoi(argv[5]);
int height = atoi(argv[6]);
int flip = -1;
float rotate = 90;
int layout = 1;
std::string model_dir = "mobilenet_v1";
if (argc > 7) {
model_dir = argv[7];
}
if (argc > 8) {
flip = atoi(argv[8]);
}
if (argc > 9) {
rotate = atoi(argv[9]);
}
if (argc > 10) {
layout = atoi(argv[10]);
}
test_img({3},
{1, 2, 4},
image_path,
dst_path,
(ImageFormat)srcFormat,
(ImageFormat)dstFormat,
width,
height,
rotate,
(FlipParam)flip,
(LayoutType)layout,
model_dir,
20);
return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
#include "paddle_image_preprocess.h" // NOLINT
#include "time.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
int64_t ShapeProduction(const shape_t& shape) {
int64_t res = 1;
for (auto i : shape) res *= i;
return res;
}
// fill the tensor with (pixel - mean) / scale and transpose layout NHWC -> NCHW, with NEON speed-up
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale) {
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
for (; i < size; i++) {
*(dout_c0++) = (*(din++) - mean[0]) / scale[0];
*(dout_c1++) = (*(din++) - mean[1]) / scale[1];
*(dout_c2++) = (*(din++) - mean[2]) / scale[2];
}
}
void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) {
#ifdef LITE_WITH_CV
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
typedef paddle::lite::utils::cv::TransParam TransParam;
typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
typedef paddle::lite_api::DataLayoutType LayoutType;
// init TransParam
TransParam tp;
tp.iw = img.cols;
tp.ih = img.rows;
tp.ow = width;
tp.oh = height;
ImageFormat srcFormat = ImageFormat::BGR;
ImageFormat dstFormat = ImageFormat::RGB;
// init ImagePreprocess
ImagePreprocess img_process(srcFormat, dstFormat, tp);
// init temp var
const uint8_t* img_ptr = reinterpret_cast<const uint8_t*>(img.data);
uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3];
uint8_t* resize_ptr = new uint8_t[width * height * 3];
// do convert bgr--rgb
img_process.imageConvert(img_ptr, rgb_ptr);
// do resize
img_process.imageResize(rgb_ptr, resize_ptr);
// data--tensor and normalize
float means[3] = {103.94f, 116.78f, 123.68f};
float scales[3] = {0.017f, 0.017f, 0.017f};
img_process.image2Tensor(
resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales);
// free the temporary buffers; the tensor now holds the preprocessed data
delete[] rgb_ptr;
delete[] resize_ptr;
#else
cv::Mat rgb_img;
cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
const float* dimg = reinterpret_cast<const float*>(imgf.data);
float* data = dstTensor.mutable_data<float>();
neon_mean_scale(dimg, data, width * height, means, scales);
#endif
}
void RunModel(std::string model_dir,
std::string img_path,
std::vector<int> input_shape,
PowerMode power_mode,
int thread_num,
int test_iter,
int warmup = 0) {
// 1. Set MobileConfig
MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data from image
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(
{input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
auto* data = input_tensor->mutable_data<float>();
// read img and pre-process
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
pre_process(img, input_shape[3], input_shape[2], *input_tensor);
// 4. Run predictor
for (int i = 0; i < warmup; ++i) {
predictor->Run();
}
double lps = 0.f;
double min_time = 1000000.f;
double max_time = 0.f;
for (int i = 0; i < test_iter; ++i) {
clock_t begin = clock();
predictor->Run();
clock_t end = clock();
double t = (end - begin) * 1000;
t = t / CLOCKS_PER_SEC;
lps += t;
if (t < min_time) {
min_time = t;
}
if (t > max_time) {
max_time = t;
}
std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl;
}
std::cout << "================== Speed Report ==================="
<< std::endl;
std::cout << "Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num << ", warmup: " << warmup
<< ", repeats: " << test_iter << ", avg time: " << lps / test_iter
<< " ms"
<< ", min time: " << min_time << " ms"
<< ", max time: " << max_time << " ms." << std::endl;
// 5. Get output and post process
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor->GetOutput(0)));
auto* outptr = output_tensor->data<float>();
auto shape_out = output_tensor->shape();
int output_num = 1;
for (int i = 0; i < shape_out.size(); ++i) {
output_num *= shape_out[i];
}
std::cout << "output_num: " << output_num << std::endl;
for (int i = 0; i < output_num; i += 100) {
std::cout << "i: " << i << ", out: " << outptr[i] << std::endl;
}
}
int main(int argc, char** argv) {
if (argc < 7) {
std::cerr << "[ERROR] usage: " << argv[0]
<< " model_dir image_path input_shape\n";
exit(1);
}
std::string model_dir = argv[1];
std::string img_path = argv[2];
std::vector<int> input_shape;
input_shape.push_back(atoi(argv[3]));
input_shape.push_back(atoi(argv[4]));
input_shape.push_back(atoi(argv[5]));
input_shape.push_back(atoi(argv[6]));
int power_mode = 3;
int threads = 1;
int test_iter = 100;
int warmup = 10;
if (argc > 7) {
power_mode = atoi(argv[7]);
}
if (argc > 8) {
threads = atoi(argv[8]);
}
if (argc > 9) {
test_iter = atoi(argv[9]);
}
if (argc > 10) {
warmup = atoi(argv[10]);
}
RunModel(model_dir,
img_path,
input_shape,
(PowerMode)power_mode,
threads,
test_iter,
warmup);
return 0;
}
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
endif()
......@@ -192,7 +192,6 @@ void nv21_bgra_basic(const uint8_t* in_data,
nv2bgra(in_data, out_data, srcw, srch, 0, 1);
}
/*
/*
With CV_BGR2GRAY the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R
With CV_RGB2GRAY the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B
......@@ -217,6 +216,21 @@ void bgr_gray_basic(const uint8_t* in_data,
}
}
}
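// BGRA/RGBA -> GRAY using the same fixed-point weights as bgr_gray_basic
// (gray = (15*c0 + 75*c1 + 38*c2) >> 7); the alpha channel is ignored.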
void bgra_gray_basic(const uint8_t* in_data,
uint8_t* out_data,
int srcw,
int srch) {
for (int i = 0; i < srch; i++) {
const uint8_t* din_ptr = in_data + i * 4 * srcw;
uint8_t* dout_ptr = out_data + i * srcw;
for (int j = 0; j < srcw; j++) {
int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38;
sum = sum >> 7;
*dout_ptr++ = sum;
din_ptr += 4;
}
}
}
void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
......@@ -228,6 +242,17 @@ void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
}
}
}
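// GRAY -> BGRA/RGBA: replicate the gray value into the three color channels, alpha = 255.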
void gray_bgra_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src;
*dst++ = *src;
*dst++ = *src;
*dst++ = 255;
src++;
}
}
}
// bgr2bgra, rgb2rgba
void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
......@@ -340,6 +365,16 @@ void image_convert_basic(const uint8_t* in_data,
(srcFormat == ImageFormat::GRAY &&
dstFormat == ImageFormat::BGR)) {
gray_bgr_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGBA &&
dstFormat == ImageFormat::GRAY) ||
(srcFormat == ImageFormat::BGRA &&
dstFormat == ImageFormat::GRAY)) {
bgra_gray_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::GRAY &&
dstFormat == ImageFormat::RGBA) ||
(srcFormat == ImageFormat::GRAY &&
dstFormat == ImageFormat::BGRA)) {
gray_bgra_basic(in_data, out_data, srcw, srch);
} else if ((srcFormat == ImageFormat::RGBA &&
dstFormat == ImageFormat::RGB) ||
(srcFormat == ImageFormat::BGRA &&
......@@ -525,6 +560,7 @@ void image_resize_basic(const uint8_t* in_data,
int y_flag = 0; // only one line
if (y_in_start < 0) {
y_flag = 1;
y_in_end = 0;
}
float b0 = ibeta[dy * 2];
float b1 = ibeta[dy * 2 + 1];
......@@ -750,6 +786,26 @@ void image_flip_basic(const uint8_t* in_data,
flipxy_basic(in_data, srch, srcw, out_data, num);
}
}
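// Scalar reference for GRAY -> float tensor: out = (pixel - mean) * scale;
// `num` is the stride between consecutive gray pixels in a row (1 for a packed GRAY image).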
void gray_to_tensor_basic(const uint8_t* bgr,
float* output,
int width,
int height,
float* means,
float* scales,
int num) {
int size = width * height;
float mean_val = means[0];
float scale_val = scales[0];
for (int h = 0; h < height; h++) {
const uint8_t* ptr_bgr = bgr + h * width * num;
float* ptr_h = output + h * width;
for (int i = 0; i < width; i++) {
*ptr_h++ = (ptr_bgr[0] - mean_val) * scale_val;
ptr_bgr += num;
}
}
}
void bgr_to_tensor_chw_basic(const uint8_t* bgr,
float* output,
......@@ -828,5 +884,8 @@ void image_to_tensor_basic(const uint8_t* in_data,
} else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA ||
srcFormat == ImageFormat::RGBA)) {
bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4);
} else if (srcFormat == ImageFormat::GRAY &&
(layout == LayoutType::kNHWC || layout == LayoutType::kNCHW)) {
gray_to_tensor_basic(in_data, output, srcw, srch, means, scales, 1);
}
}
......@@ -20,6 +20,7 @@
#include "lite/core/profile/timer.h"
#include "lite/tests/cv/cv_basic.h"
#include "lite/utils/cv/paddle_image_preprocess.h"
#include "time.h" // NOLINT
DEFINE_int32(cluster, 3, "cluster id");
DEFINE_int32(threads, 1, "threads num");
......@@ -28,15 +29,15 @@ DEFINE_int32(repeats, 1, "repeats times");
DEFINE_bool(basic_test, false, "do all tests");
DEFINE_bool(check_result, true, "check the result");
DEFINE_int32(srcFormat, 0, "input image format");
DEFINE_int32(dstFormat, 1, "output image format");
DEFINE_int32(srcFormat, 0, "input image format RGBA");
DEFINE_int32(dstFormat, 2, "output image format RGB");
DEFINE_int32(srch, 1920, "input height");
DEFINE_int32(srcw, 1080, "input width");
DEFINE_int32(dsth, 960, "output height");
DEFINE_int32(dstw, 540, "output width");
DEFINE_int32(angle, 90, "rotate angle");
DEFINE_int32(flip_num, 0, "flip x");
DEFINE_int32(layout, 0, "layout nchw");
DEFINE_int32(layout, 1, "layout nchw");
typedef paddle::lite::utils::cv::ImageFormat ImageFormat;
typedef paddle::lite::utils::cv::FlipParam FlipParam;
......@@ -99,7 +100,7 @@ void test_img(const std::vector<int>& cluster_id,
float rotate,
FlipParam flip,
LayoutType layout,
int test_iter = 1) {
int test_iter = 10) {
#ifdef LITE_WITH_ARM
paddle::lite::DeviceInfo::Init();
#endif
......@@ -221,7 +222,7 @@ void test_img(const std::vector<int>& cluster_id,
float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f};
if (FLAGS_check_result) {
LOG(INFO) << "image convert basic compute";
// LOG(INFO) << "image convert basic compute";
image_convert_basic(src,
basic_dst,
(ImageFormat)srcFormat,
......@@ -230,7 +231,7 @@ void test_img(const std::vector<int>& cluster_id,
srch,
out_size);
LOG(INFO) << "image resize basic compute";
// LOG(INFO) << "image resize basic compute";
image_resize_basic(basic_dst,
resize_basic,
(ImageFormat)dstFormat,
......@@ -239,7 +240,7 @@ void test_img(const std::vector<int>& cluster_id,
dstw,
dsth);
LOG(INFO) << "image rotate basic compute";
// LOG(INFO) << "image rotate basic compute";
image_rotate_basic(resize_basic,
tv_out_ratote_basic,
(ImageFormat)dstFormat,
......@@ -247,7 +248,7 @@ void test_img(const std::vector<int>& cluster_id,
dsth,
rotate);
LOG(INFO) << "image flip basic compute";
// LOG(INFO) << "image flip basic compute";
image_flip_basic(resize_basic,
tv_out_flip_basic,
(ImageFormat)dstFormat,
......@@ -255,7 +256,7 @@ void test_img(const std::vector<int>& cluster_id,
dsth,
flip);
LOG(INFO) << "image to tensor basic compute";
// LOG(INFO) << "image to tensor basic compute";
image_to_tensor_basic(resize_basic,
&tensor_basic,
(ImageFormat)dstFormat,
......@@ -267,10 +268,13 @@ void test_img(const std::vector<int>& cluster_id,
}
Timer t1;
Timer t_convert;
Timer t_resize;
Timer t_flip;
Timer t_rotate;
Timer t_tensor;
LOG(INFO) << "saber cv compute";
double to = 0;
double min_time = 100000;
TransParam tparam;
tparam.ih = srch;
tparam.iw = srcw;
......@@ -285,15 +289,17 @@ void test_img(const std::vector<int>& cluster_id,
ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
for (int i = 0; i < test_iter; ++i) {
t1.Reset();
t1.Start();
LOG(INFO) << "image convert saber compute";
// LOG(INFO) << "image convert saber compute";
t_convert.Start();
// Short form: image_preprocess.imageConvert(src, lite_dst);
image_preprocess.imageCovert(
image_preprocess.imageConvert(
src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat);
t_convert.Stop();
LOG(INFO) << "image resize saber compute";
// LOG(INFO) << "image resize saber compute";
t_resize.Start();
// Short form: image_preprocess.imageResize(lite_dst, resize_tmp);
image_preprocess.imageResize(lite_dst,
resize_tmp,
......@@ -302,8 +308,10 @@ void test_img(const std::vector<int>& cluster_id,
srch,
dstw,
dsth);
t_resize.Stop();
LOG(INFO) << "image rotate saber compute";
// LOG(INFO) << "image rotate saber compute";
t_rotate.Start();
// Short form: image_preprocess.imageRotate(resize_tmp, tv_out_ratote);
image_preprocess.imageRotate(resize_tmp,
tv_out_ratote,
......@@ -311,13 +319,17 @@ void test_img(const std::vector<int>& cluster_id,
dstw,
dsth,
rotate);
t_rotate.Stop();
LOG(INFO) << "image flip saber compute";
// LOG(INFO) << "image flip saber compute";
t_flip.Start();
// Short form: image_preprocess.imageFlip(resize_tmp, tv_out_flip);
image_preprocess.imageFlip(
resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip);
t_flip.Stop();
LOG(INFO) << "image to tensor compute";
// LOG(INFO) << "image to tensor compute";
t_tensor.Start();
// Short form: image_preprocess.image2Tensor(
// resize_tmp, &dst_tensor, layout, means, scales);
image_preprocess.image2Tensor(resize_tmp,
......@@ -328,16 +340,27 @@ void test_img(const std::vector<int>& cluster_id,
layout,
means,
scales);
t_tensor.Stop();
t1.Stop();
double tdiff = t1.LapTimes().Avg();
to += tdiff;
if (tdiff < min_time) {
min_time = tdiff;
}
}
LOG(INFO) << "image trans total time : " << to
<< ", avg time : " << to / test_iter;
LOG(INFO) << "image convert avg time : " << t_convert.LapTimes().Avg()
<< ", min time: " << t_convert.LapTimes().Min()
<< ", max time: " << t_convert.LapTimes().Max();
LOG(INFO) << "image resize avg time : " << t_resize.LapTimes().Avg()
<< ", min time: " << t_resize.LapTimes().Min()
<< ", max time: " << t_resize.LapTimes().Max();
LOG(INFO) << "image rotate avg time : " << t_rotate.LapTimes().Avg()
<< ", min time: " << t_rotate.LapTimes().Min()
<< ", max time: " << t_rotate.LapTimes().Max();
LOG(INFO) << "image flip avg time : " << t_flip.LapTimes().Avg()
<< ", min time: " << t_flip.LapTimes().Min()
<< ", max time: " << t_flip.LapTimes().Max();
LOG(INFO) << "image tensor avg time : " << t_tensor.LapTimes().Avg()
<< ", min time: " << t_tensor.LapTimes().Min()
<< ", max time: " << t_tensor.LapTimes().Max();
LOG(INFO) << "image trans total avg time : " << t1.LapTimes().Avg()
<< ", min time: " << t1.LapTimes().Min()
<< ", max time: " << t1.LapTimes().Max();
double max_ratio = 0;
double max_diff = 0;
......@@ -536,7 +559,7 @@ void test_img(const std::vector<int>& cluster_id,
}
}
#if 1
#if 0
TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
......@@ -546,19 +569,16 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto dstFormat : {0, 1, 2, 3, 4}) {
for (auto layout : {1}) {
if ((dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::NV12 ||
if ((srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
(dstFormat == ImageFormat::GRAY)) {
continue;
}
if ((dstFormat == ImageFormat::NV12 ||
dstFormat == ImageFormat::NV21) &&
(srcFormat == ImageFormat::GRAY)) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
......@@ -591,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) {
}
}
#endif
#if 1
#if 0
TEST(TestImageConvertRand, test_func_image_resize_preprocess) {
if (FLAGS_basic_test) {
for (auto w : {1, 4, 8, 16, 112, 224, 1092}) {
......@@ -601,21 +621,13 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) {
for (auto rotate : {180}) {
for (auto flip : {0}) {
for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) {
for (auto dstFormat : {0, 1, 2, 3}) {
for (auto dstFormat : {0, 1, 2, 3, 4, 11}) {
for (auto layout : {1}) {
if (dstFormat == ImageFormat::NV12 ||
dstFormat == ImageFormat::NV21 ||
(dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
dstFormat == ImageFormat::NV21 ||
(srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
dstFormat == ImageFormat::GRAY) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
......@@ -656,25 +668,10 @@ TEST(TestImageConvertRand, test_func_image_trans_preprocess) {
for (auto ww : {32, 112}) {
for (auto hh : {112}) {
for (auto rotate : {90, 180, 270}) {
for (auto flip : {0, 1, 2}) {
for (auto srcFormat : {11}) {
for (auto dstFormat : {3}) {
for (auto flip : {-1, 0, 1}) {
for (auto srcFormat : {0}) {
for (auto dstFormat : {0, 1, 2, 3, 4}) {
for (auto layout : {1, 3}) {
if (dstFormat == ImageFormat::NV12 ||
dstFormat == ImageFormat::NV21 ||
(dstFormat == ImageFormat::GRAY &&
(srcFormat == ImageFormat::RGBA ||
srcFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::GRAY &&
(dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) ||
(srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) &&
(dstFormat == ImageFormat::GRAY ||
dstFormat == ImageFormat::RGBA ||
dstFormat == ImageFormat::BGRA)) {
continue;
}
if (srcFormat == ImageFormat::NV12 ||
srcFormat == ImageFormat::NV21) {
if (w % 2) {  // width must be even: NV12/NV21 pair two Y rows with one UV row
......@@ -717,7 +714,8 @@ TEST(TestImageConvertCustom, test_func_image_preprocess_custom) {
(ImageFormat)FLAGS_dstFormat,
FLAGS_angle,
(FlipParam)FLAGS_flip_num,
(LayoutType)FLAGS_layout);
(LayoutType)FLAGS_layout,
20);
}
#endif
#endif
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
set(lite_cv_deps)
lite_cc_library(paddle_cv_arm SRCS
image_convert.cc
paddle_image_preprocess.cc
......@@ -7,5 +6,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_
image_flip.cc
image_rotate.cc
image_resize.cc
DEPS ${lite_cv_deps} paddle_api place)
DEPS paddle_api place)
endif()
......@@ -18,6 +18,13 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void gray_to_tensor(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales);
void bgr_to_tensor_chw(const uint8_t* src,
float* output,
int width,
......@@ -52,7 +59,7 @@ void bgra_to_tensor_hwc(const uint8_t* src,
* NCHW
* param src: input image data
* param dstTensor: output tensor data
* param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA)
* param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA)
* param srcw: input image width
* param srch: input image height
* param layout: output tensor layout,support NHWC and NCHW
......@@ -79,6 +86,9 @@ void Image2Tensor::choose(const uint8_t* src,
} else if (layout == LayoutType::kNHWC &&
(srcFormat == BGRA || srcFormat == RGBA)) {
impl_ = bgra_to_tensor_hwc;
} else if ((layout == LayoutType::kNHWC || layout == LayoutType::kNCHW) &&
(srcFormat == GRAY)) {
impl_ = gray_to_tensor;
} else {
printf("this layout: %d or image format: %d not support \n",
static_cast<int>(layout),
......@@ -87,6 +97,147 @@ void Image2Tensor::choose(const uint8_t* src,
}
impl_(src, output, srcw, srch, means, scales);
}
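// GRAY image to float tensor: out[i] = (pixel[i] - means[0]) * scales[0].
// The NEON body below processes 16 pixels per iteration per row; the scalar loop handles the tail.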
void gray_to_tensor(const uint8_t* src,
float* output,
int width,
int height,
float* means,
float* scales) {
int size = width * height;
float mean_val = means[0];
float scale_val = scales[0];
int dim16 = width >> 4;  // 16 gray pixels per NEON iteration
int remain = width % 16;
float32x4_t vmean = vdupq_n_f32(mean_val);
float32x4_t vscale = vdupq_n_f32(scale_val);
#pragma omp parallel for
for (int i = 0; i < height; i += 1) {
const uint8_t* din_ptr = src + i * width;
float* ptr_h = output + i * width;
int cnt = dim16;
if (cnt > 0) {
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr0], #64] \n"
"prfm pldl1keep, [%[inptr0], #128] \n"
"prfm pldl1keep, [%[inptr0], #192] \n"
"1: \n"
"ld1 {v0.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.."
"ld1 {v1.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.."
// 8->16
"ushll v3.8h, v0.8b, #0 \n"
"ushll v4.8h, v0.8b, #0 \n"
// 16->32
"ushll v6.4s, v3.4h, #0 \n"
"ushll2 v7.4s, v3.8h, #0 \n"
"ushll v8.4s, v4.4h, #0 \n"
"ushll2 v9.4s, v4.8h, #0 \n"
// int32->fp32
"ucvtf v12.4s, v6.4s \n"
"ucvtf v13.4s, v7.4s \n"
"ucvtf v14.4s, v8.4s \n"
"ucvtf v15.4s, v9.4s \n"
// sub -mean
"fsub v12.4s, v12.4s, %w[vmean].4s \n"
"fsub v13.4s, v13.4s, %w[vmean].4s \n"
"fsub v14.4s, v14.4s, %w[vmean].4s \n"
"fsub v15.4s, v15.4s, %w[vmean].4s \n"
// mul * scale
"fmul v6.4s, v12.4s, %w[vscale].4s \n"
"fmul v7.4s, v13.4s, %w[vscale].4s \n"
"fmul v8.4s, v14.4s, %w[vscale].4s \n"
"fmul v9.4s, v15.4s, %w[vscale].4s \n"
// store
"st1 {v6.4s}, [%[outr0]], #16 \n"
"subs %w[cnt], %w[cnt], #1 \n"
"st1 {v7.4s}, [%[outr0]], #16 \n"
"st1 {v8.4s}, [%[outr0]], #16 \n"
"st1 {v9.4s}, [%[outr0]], #16 \n"
"bne 1b \n"
: [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt)
: [vmean] "w"(vmean), [vscale] "w"(vscale)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
#else
asm volatile(
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr0], #64] @ preload a, 64byte\n"
"pld [%[inptr0], #128] @ preload a, 64byte\n"
"pld [%[inptr0], #192] @ preload a, 64byte\n"
"1: \n"
"vld1.8 {d12, d13}, [%[inptr0]]! \n"
// 8->16
"vmovl.u8 q8, d12 \n"
"vmovl.u8 q9, d13 \n"
// 16->32
"vmovl.u16 q11, d16 \n"
"vmovl.u16 q12, d17 \n"
"vmovl.u16 q13, d18 \n"
"vmovl.u16 q14, d19 \n"
// int32->fp32
"vcvt.f32.u32 q7, q11 \n"
"vcvt.f32.u32 q8, q12 \n"
"vcvt.f32.u32 q9, q13 \n"
"vcvt.f32.u32 q10, q14 \n"
// sub -mean
"vsub.f32 q7, q7, %q[vmean] \n"
"vsub.f32 q8, q8, %q[vmean] \n"
"vsub.f32 q9, q9, %q[vmean] \n"
"vsub.f32 q10, q10, %q[vmean] \n"
// mul *scale
"vmul.f32 q11, q7, %q[vscale] \n"
"vmul.f32 q12, q8, %q[vscale] \n"
"vmul.f32 q13, q9, %q[vscale] \n"
"vmul.f32 q14, q10, %q[vscale] \n"
// store
"vst1.32 {d22 - d23}, [%[outr0]]! \n"
"subs %[cnt], #1 \n"
"vst1.32 {d24 - d25}, [%[outr0]]! \n"
"vst1.32 {d26 - d27}, [%[outr0]]! \n"
"vst1.32 {d28 - d29}, [%[outr0]]! \n"
"bne 1b"
: [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt)
: [vmean] "w"(vmean), [vscale] "w"(vscale)
: "cc",
"memory",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14");
#endif
}
for (int j = 0; j < remain; j++) {
*ptr_h++ = (*din_ptr - mean_val) * scale_val;
din_ptr++;
}
}
}
void bgr_to_tensor_chw(const uint8_t* src,
float* output,
int width,
......@@ -390,6 +541,7 @@ void bgra_to_tensor_chw(const uint8_t* src,
}
}
}
void bgr_to_tensor_hwc(const uint8_t* src,
float* output,
int width,
......
......@@ -30,10 +30,14 @@ void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch);
void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgra rgba to gray
void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr rgb to gray
void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// gray to bgr rgb
void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// gray to bgra rgba
void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgr to bgra or rgb to rgba
void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch);
// bgra to bgr or rgba to rgb
......@@ -112,6 +116,12 @@ void ImageConvert::choose(const uint8_t* src,
} else if ((srcFormat == RGB && dstFormat == BGRA) ||
(srcFormat == BGR && dstFormat == RGBA)) {
impl_ = hwc3_trans_hwc4;
} else if ((srcFormat == GRAY && dstFormat == RGBA) ||
(srcFormat == GRAY && dstFormat == BGRA)) {
impl_ = hwc1_to_hwc4;
} else if ((srcFormat == RGBA && dstFormat == GRAY) ||
(srcFormat == BGRA && dstFormat == GRAY)) {
impl_ = hwc4_to_hwc1;
} else {
printf("srcFormat: %d, dstFormat: %d does not support! \n",
srcFormat,
......@@ -989,7 +999,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
"vshrn.u32 d24, q6, #7 \n"
"vshrn.u32 d25, q7, #7 \n"
"vshrn.u32 d26, q8, #7 \n"
"vshrn.u32 d27, q8, #7 \n"
"vshrn.u32 d27, q9, #7 \n"
// 16->8
"vmovn.u16 d4, q10 \n"
"vmovn.u16 d5, q11 \n"
......@@ -1077,6 +1087,280 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
}
}
/*
bgra2gray, rgba2gray:
With CV_BGR2GRAY the conversion formula is Gray = 0.1140*B + 0.5870*G + 0.2989*R
With CV_RGB2GRAY the conversion formula is Gray = 0.1140*R + 0.5870*G + 0.2989*B
Fixed-point weights (scaled by 128):
b = 0.114  * 128 = 14.592 ~= 15
g = 0.587  * 128 = 75.136 ~= 75
r = 0.2989 * 128 = 38.259 ~= 38
Gray = (15*B + 75*G + 38*R) / 128
*/
void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
uint8_t b = 15;
uint8_t g = 75;
uint8_t r = 38;
uint8x8_t vb = vdup_n_u8(b);
uint8x8_t vg = vdup_n_u8(g);
uint8x8_t vr = vdup_n_u8(r);
#ifdef __aarch64__
#else
uint8_t vb_array[8] = {b, b, b, b, b, b, b, b};
uint8_t vg_array[8] = {g, g, g, g, g, g, g, g};
uint8_t vr_array[8] = {r, r, r, r, r, r, r, r};
#endif
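  // The gray weights are kept both as NEON vectors (vb/vg/vr, passed as "w"
  // operands to the aarch64 block and used by the scalar-tail intrinsics)
  // and, on armv7, as plain byte arrays that the assembly reloads with
  // vld1.8 into d0/d1/d2.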
int cnt_pro = srcw >> 3;
int remain_pro = srcw % 8;
int win = srcw * 4;
int i = 0;
#pragma omp parallel for
for (i = 0; i < srch - 3; i += 4) {
int j = 0;
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
uint8_t* outr0 = dst + i * srcw;
uint8_t* outr1 = outr0 + srcw;
uint8_t* outr2 = outr1 + srcw;
uint8_t* outr3 = outr2 + srcw;
int cnt = cnt_pro;
if (cnt > 0) {
#ifdef __aarch64__
asm volatile(
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr0], #128] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr1], #128] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr2], #128] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"prfm pldl1keep, [%[inptr3], #128] \n"
"1: \n"
"ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld4 {v4.8b - v7.8b}, [%[inptr1]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld4 {v8.8b - v11.8b}, [%[inptr2]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
"ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 =
// y1y4y7...
// mul b
"umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb
"umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb
"umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb
"umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb
// mul g
"umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb
"umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb
"umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb
"umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb
// mul r
"umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb
"umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb
"umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb
"umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb
// 16->32
"uaddl v0.4s, v17.4h, v13.4h \n"
"uaddl2 v1.4s, v17.8h, v13.8h \n"
"uaddl v2.4s, v18.4h, v14.4h \n"
"uaddl2 v3.4s, v18.8h, v14.8h \n"
"uaddl v4.4s, v19.4h, v15.4h \n"
"uaddl2 v5.4s, v19.8h, v15.8h \n"
"uaddl v6.4s, v20.4h, v16.4h \n"
"uaddl2 v7.4s, v20.8h, v16.8h \n"
// 32->16 v0 >> 7
"shrn v12.4h, v0.4s, #7 \n"
"shrn2 v12.8h, v1.4s, #7 \n"
"shrn v13.4h, v2.4s, #7 \n"
"shrn2 v13.8h, v3.4s, #7 \n"
"shrn v14.4h, v4.4s, #7 \n"
"shrn2 v14.8h, v5.4s, #7 \n"
"shrn v15.4h, v6.4s, #7 \n"
"shrn2 v15.8h, v7.4s, #7 \n"
// 16->8
"xtn v0.8b, v12.8h \n"
"xtn v1.8b, v13.8h \n"
"xtn v2.8b, v14.8h \n"
"xtn v3.8b, v15.8h \n"
"subs %w[cnt], %w[cnt], #1 \n"
"st1 {v0.8b}, [%[outr0]], #8 \n"
"st1 {v1.8b}, [%[outr1]], #8 \n"
"st1 {v2.8b}, [%[outr2]], #8 \n"
"st1 {v3.8b}, [%[outr3]], #8 \n"
"bne 1b \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outr0] "+r"(outr0),
[outr1] "+r"(outr1),
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20");
#else
asm volatile(
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr0], #128] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr1], #128] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr2], #128] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"pld [%[inptr3], #128] @ preload a, 64byte\n"
"vld1.8 d0, [%[vb]] \n"
"vld1.8 d1, [%[vg]] \n"
"vld1.8 d2, [%[vr]] \n"
"1: \n"
"vld4.8 {d3, d4, d5, d6}, [%[inptr0]]! \n"
"vld4.8 {d7, d8, d9, d10}, [%[inptr1]]! \n"
"vld4.8 {d11, d12, d13, d14}, [%[inptr2]]! \n"
"vld4.8 {d15, d16, d17, d18}, [%[inptr3]]! \n"
// vb
"vmull.u8 q10, d3, d0 \n"
"vmull.u8 q11, d7, d0 \n"
"vmull.u8 q12, d11, d0 \n"
"vmull.u8 q13, d15, d0 \n"
// vg
"vmull.u8 q14, d4, d1 \n"
"vmull.u8 q15, d8, d1 \n"
"vmull.u8 q5, d12, d1 \n"
"vmull.u8 q7, d16, d1 \n"
// vr
"vmlal.u8 q10, d5, d2 \n"
"vmlal.u8 q11, d9, d2 \n"
"vmlal.u8 q12, d13, d2 \n"
"vmlal.u8 q13, d17, d2 \n"
// 16->32
"vaddl.u16 q2, d28, d20 \n"
"vaddl.u16 q3, d29, d21 \n"
"vaddl.u16 q4, d30, d22 \n"
"vaddl.u16 q10, d31, d23 \n"
"vaddl.u16 q6, d10, d24 \n"
"vaddl.u16 q11, d11, d25 \n"
"vaddl.u16 q8, d14, d26 \n"
"vaddl.u16 q9, d15, d27 \n"
// 32->16 q2 >> 7
"vshrn.u32 d10, q2, #7 \n"
"vshrn.u32 d11, q3, #7 \n"
"vshrn.u32 d14, q4, #7 \n"
"vshrn.u32 d15, q10, #7 \n"
"vshrn.u32 d24, q6, #7 \n"
"vshrn.u32 d25, q11, #7 \n"
"vshrn.u32 d26, q8, #7 \n"
"vshrn.u32 d27, q9, #7 \n"
// 16->8
"vmovn.u16 d4, q5 \n"
"vmovn.u16 d5, q7 \n"
"vmovn.u16 d6, q12 \n"
"vmovn.u16 d7, q13 \n"
"subs %[cnt], #1 \n"
// store
"vst1.8 d4, [%[outr0]]! \n"
"vst1.8 d5, [%[outr1]]! \n"
"vst1.8 d6, [%[outr2]]! \n"
"vst1.8 d7, [%[outr3]]! \n"
"bne 1b \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outr0] "+r"(outr0),
[outr1] "+r"(outr1),
[outr2] "+r"(outr2),
[outr3] "+r"(outr3),
[cnt] "+r"(cnt)
: [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
for (; j < remain_pro; j++) {
*outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
*outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7;
*outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7;
*outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7;
inptr0 += 4;
inptr1 += 4;
inptr2 += 4;
inptr3 += 4;
}
}
for (; i < srch; i++) {
int j = 0;
const uint8_t* inptr0 = src + i * win;
uint8_t* outr0 = dst + i * srcw;
for (j = 0; j < cnt_pro; j++) {
      uint8x8x4_t y0 = vld4_u8(inptr0);  // y0.val[0..3] = B/G/R/A of 8 pixels
uint16x8_t val0 = vmull_u8(y0.val[0], vb);
uint16x8_t val0_1 = vmull_u8(y0.val[1], vg);
val0 = vmlal_u8(val0, y0.val[2], vr);
uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0));
uint32x4_t v0_sum1 =
vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0));
uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7);
uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7);
uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16);
uint8x8_t vout0 = vmovn_u16(v0_sum);
inptr0 += 32;
vst1_u8(outr0, vout0);
outr0 += 8;
}
for (; j < srcw; j++) {
*outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7;
inptr0 += 4;
}
}
}
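// A scalar sketch of the fixed-point conversion above, for reference only
// (not part of the library API); it assumes a packed HWC BGRA/RGBA input:
//   for (int p = 0; p < srcw * srch; ++p) {
//     const uint8_t* px = src + 4 * p;
//     dst[p] = (15 * px[0] + 75 * px[1] + 38 * px[2]) >> 7;
//   }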
/*
Using CV_GRAY2BGR: B = G = R = Gray
Using CV_GRAY2RGB: R = G = B = Gray
gray2bgr, gray2rgb
......@@ -1091,6 +1375,22 @@ void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
}
}
}
/*
Using CV_GRAY2BGRA: B = G = R = Gray, A = 255
Using CV_GRAY2RGBA: R = G = B = Gray, A = 255
gray2bgra, gray2rgba
*/
void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
for (int j = 0; j < srcw; j++) {
*dst++ = *src;
*dst++ = *src;
*dst++ = *src;
*dst++ = 255;
src++;
}
}
}
// bgr2bgra, rgb2rgba
void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) {
for (int i = 0; i < srch; i++) {
......
......@@ -19,6 +19,23 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void ImageFlip::choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
FlipParam flip_param) {
if (srcFormat == GRAY) {
flip_hwc1(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGR || srcFormat == RGB) {
flip_hwc3(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
flip_hwc4(src, dst, srcw, srch, flip_param);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
}
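// The flip kernels below are specialized by channel count (hwc1 = GRAY,
// hwc3 = BGR/RGB, hwc4 = BGRA/RGBA); each one then dispatches on the
// FlipParam value (X, Y or XY).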
// gray
void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
......@@ -43,6 +60,9 @@ void flip_hwc1(const uint8_t* src,
flip_hwc1_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc1_xy(src, dst, srcw, srch);
} else {
printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
return;
}
}
......@@ -57,6 +77,9 @@ void flip_hwc3(const uint8_t* src,
flip_hwc3_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc3_xy(src, dst, srcw, srch);
} else {
printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
return;
}
}
......@@ -71,6 +94,9 @@ void flip_hwc4(const uint8_t* src,
flip_hwc4_y(src, dst, srcw, srch);
} else if (flip_param == XY) {
flip_hwc4_xy(src, dst, srcw, srch);
} else {
printf("its doesn't support Flip: %d \n", static_cast<int>(flip_param));
return;
}
}
/*
......
......@@ -21,6 +21,15 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
class ImageFlip {
public:
void choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
FlipParam flip_param);
};
void flip_hwc1(
const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param);
void flip_hwc3(
......
......@@ -38,6 +38,15 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void ImageResize::choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
}
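// ImageResize::choose is a thin wrapper over the free function resize(),
// which implements the bilinear interpolation described in the header.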
void compute_xy(int srcw,
int srch,
int dstw,
......
......@@ -39,6 +39,16 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
class ImageResize {
public:
void choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
int dstw,
int dsth);
};
void resize(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
......
......@@ -19,6 +19,26 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void ImageRotate::choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float degree) {
if (degree != 90 && degree != 180 && degree != 270) {
printf("this degree: %f not support \n", degree);
}
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
}
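// The rotate kernels below are likewise specialized by channel count; they
// only implement the 90/180/270-degree cases and print an error for any
// other angle.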
// gray
void rotate_hwc1_90(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out);
......@@ -50,6 +70,9 @@ void rotate_hwc1(
rotate_hwc1_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc1_270(src, dst, srcw, srch, srch, srcw);
} else {
printf("this degree: %f does not support! \n", degree);
return;
}
}
......@@ -61,6 +84,9 @@ void rotate_hwc3(
rotate_hwc3_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc3_270(src, dst, srcw, srch, srch, srcw);
} else {
printf("this degree: %f does not support! \n", degree);
return;
}
}
......@@ -72,6 +98,9 @@ void rotate_hwc4(
rotate_hwc4_180(src, dst, srcw, srch, srcw, srch);
} else if (degree == 270) {
rotate_hwc4_270(src, dst, srcw, srch, srch, srcw);
} else {
printf("this degree: %f does not support! \n", degree);
return;
}
}
#ifdef __aarch64__
......@@ -578,6 +607,7 @@ void rotate_hwc1_90(const uint8_t* src,
int stride_h = 4 * w_in;
int stride_h_w = 4 * w_in - 8;
int stride_out = 4 * w_out;
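  // ww is the left edge of the right-most 8-column block of the rotated
  // output: the block of source rows [i, i + 8) is written starting at
  // destination column (ww - i), i.e. mirrored along the output width,
  // rather than at column i as a plain transpose would do.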
int ww = w_out - 8;
#pragma omp parallel for
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * w_in;
......@@ -586,7 +616,7 @@ void rotate_hwc1_90(const uint8_t* src,
const uint8_t* inptr3 = inptr2 + w_in;
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * w_out + i;
uint8_t* outptr0 = dst + j * w_out + (ww - i);
uint8_t* outptr1 = outptr0 + w_out;
uint8_t* outptr2 = outptr1 + w_out;
uint8_t* outptr3 = outptr2 + w_out;
......@@ -648,7 +678,7 @@ void rotate_hwc1_90(const uint8_t* src,
const uint8_t* inptr6 = inptr5 + w_in;
const uint8_t* inptr7 = inptr6 + w_in;
for (; j < w_in; j++) {
uint8_t* outptr = dst + j * w_out + i;
uint8_t* outptr = dst + j * w_out + ww - i;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
......@@ -659,10 +689,11 @@ void rotate_hwc1_90(const uint8_t* src,
*outptr++ = *inptr7++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * w_in;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * w_out + i;
uint8_t* outptr0 = dst + j * w_out + ww - i;
*outptr0 = *inptr0++;
}
}
......@@ -693,9 +724,9 @@ void rotate_hwc1_180(const uint8_t* src,
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last
uint8_t* outptr1 = outptr0 + w_out;
uint8_t* outptr2 = outptr1 + w_out;
uint8_t* outptr3 = outptr2 + w_out;
uint8_t* outptr1 = outptr0 - w_out;
uint8_t* outptr2 = outptr1 - w_out;
uint8_t* outptr3 = outptr2 - w_out;
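    // 180-degree rotation maps source row r to destination row (h_in - 1 - r),
    // so successive source rows write to successively earlier destination
    // rows: outptr1..outptr3 step backwards by w_out from outptr0.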
if (i + 3 >= h_in) {
uint8_t* ptr = zerobuff + w_in - stride_w;
......
......@@ -16,10 +16,20 @@
#include <stdint.h>
#include <vector>
#include "lite/utils/cv/paddle_image_preprocess.h"
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
class ImageRotate {
public:
void choose(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
int srcw,
int srch,
float degree);
};
void rotate_hwc1(
const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree);
void rotate_hwc3(
......
......@@ -25,7 +25,6 @@ namespace paddle {
namespace lite {
namespace utils {
namespace cv {
#define PI 3.14159265f
#define Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180))
#define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI))
......@@ -38,7 +37,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat,
this->dstFormat_ = dstFormat;
this->transParam_ = param;
}
void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) {
void ImagePreprocess::imageConvert(const uint8_t* src, uint8_t* dst) {
ImageConvert img_convert;
img_convert.choose(src,
dst,
......@@ -48,10 +47,10 @@ void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) {
this->transParam_.ih);
}
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat) {
void ImagePreprocess::imageConvert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat) {
ImageConvert img_convert;
img_convert.choose(src,
dst,
......@@ -68,7 +67,8 @@ void ImagePreprocess::imageResize(const uint8_t* src,
int srch,
int dstw,
int dsth) {
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
ImageResize img_resize;
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
}
void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) {
......@@ -77,7 +77,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) {
int dstw = this->transParam_.ow;
int dsth = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
resize(src, dst, srcFormat, srcw, srch, dstw, dsth);
ImageResize img_resize;
img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth);
}
void ImagePreprocess::imageRotate(const uint8_t* src,
......@@ -86,19 +87,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src,
int srcw,
int srch,
float degree) {
if (degree != 90 && degree != 180 && degree != 270) {
printf("this degree: %f not support \n", degree);
}
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
ImageRotate img_rotate;
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
}
void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) {
......@@ -106,10 +96,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) {
auto srch = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
auto degree = this->transParam_.rotate_param;
if (degree != 90 && degree != 180 && degree != 270) {
printf("this degree: %f not support \n", degree);
}
ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree);
ImageRotate img_rotate;
img_rotate.choose(src, dst, srcFormat, srcw, srch, degree);
}
void ImagePreprocess::imageFlip(const uint8_t* src,
......@@ -118,16 +106,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src,
int srcw,
int srch,
FlipParam flip_param) {
if (srcFormat == GRAY) {
flip_hwc1(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGR || srcFormat == RGB) {
flip_hwc3(src, dst, srcw, srch, flip_param);
} else if (srcFormat == BGRA || srcFormat == RGBA) {
flip_hwc4(src, dst, srcw, srch, flip_param);
} else {
printf("this srcFormat: %d does not support! \n", srcFormat);
return;
}
ImageFlip img_flip;
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
}
void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) {
......@@ -135,7 +115,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) {
auto srch = this->transParam_.oh;
auto srcFormat = this->dstFormat_;
auto flip_param = this->transParam_.flip_param;
ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param);
ImageFlip img_flip;
img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param);
}
void ImagePreprocess::image2Tensor(const uint8_t* src,
......
......@@ -19,6 +19,7 @@
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h"
namespace paddle {
namespace lite {
namespace utils {
......@@ -37,9 +38,9 @@ enum ImageFormat {
};
// flip enum
enum FlipParam {
X = 0, // flip along the X axis
Y, // flip along the Y axis
XY // flip along the XY axis
XY = -1, // flip along the XY axis
X = 0, // flip along the X axis
Y // flip along the Y axis
};
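// Example (illustrative sketch; `param` is a TransParam filled in by the
// caller, as in the test_cv demo):
//   ImagePreprocess preprocess(BGR, BGR, param);
//   preprocess.imageFlip(src, dst, BGR, width, height, XY);  // flip both axes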
// transform param
typedef struct {
......@@ -69,11 +70,12 @@ class ImagePreprocess {
* BGR(RGB)and BGRA(RGBA) transform,
* BGR(RGB)and RGB(BGR) transform,
* BGR(RGB)and RGBA(BGRA) transform,
* BGR(RGB)and GRAY transform,
* BGR(RGB) and GRAY transform,
* BGRA(RGBA) and GRAY transform,
* param src: input image data
* param dst: output image data
*/
void imageCovert(const uint8_t* src, uint8_t* dst);
void imageConvert(const uint8_t* src, uint8_t* dst);
/*
* image color convert
* support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
......@@ -81,6 +83,7 @@ class ImagePreprocess {
* BGR(RGB)and RGB(BGR) transform,
* BGR(RGB)and RGBA(BGRA) transform,
* BGR(RGB)and GRAY transform,
* BGRA(RGBA) and GRAY transform,
* param src: input image data
* param dst: output image data
* param srcFormat: input image format, support: GRAY, NV12(NV21),
......@@ -88,10 +91,10 @@ class ImagePreprocess {
* param dstFormat: output image format, support GRAY, BGR(RGB) and
* BGRA(RGBA)
*/
void imageCovert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat);
void imageConvert(const uint8_t* src,
uint8_t* dst,
ImageFormat srcFormat,
ImageFormat dstFormat);
/*
* image resize, use bilinear method
* support image format: 1-channel image (egs: GRAY, 2-channel image (egs:
......@@ -171,7 +174,8 @@ class ImagePreprocess {
FlipParam flip_param);
/*
* change image data to tensor data
* support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
* and
* NCHW
* param src: input image data
* param dstTensor: output tensor data
......@@ -186,7 +190,8 @@ class ImagePreprocess {
float* scales);
/*
* change image data to tensor data
* support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and
* support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
* and
* NCHW
* param src: input image data
* param dstTensor: output tensor data
......
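// A typical use of the ImagePreprocess class declared above (illustrative
// sketch only; `param` is a TransParam whose input/output sizes and
// rotate/flip settings were filled in by the caller, e.g. as in the
// demo/cxx/test_cv demo):
//   ImagePreprocess preprocess(NV12, BGR, param);
//   preprocess.imageConvert(nv12_buf, bgr_buf);     // NV12 -> BGR
//   preprocess.imageResize(bgr_buf, resized_buf);   // to ow x oh
//   preprocess.imageFlip(resized_buf, flipped_buf); // per param.flip_param
//   // ...then image2Tensor(...) fills the model's input tensor.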