PaddlePaddle / Paddle-Lite
Commit 186397fe
Authored by HappyAngel on Dec 23, 2019; committed by Yan Chunwei on Dec 23, 2019
[lite][arm]fix model_optimize bug, update concat and split op, speed up (#2620)
Parent: e659e4ab
Showing 13 changed files with 434 additions and 76 deletions (+434 −76)
CMakeLists.txt                                                   +1   −1
cmake/lite.cmake                                                 +9   −5
lite/CMakeLists.txt                                              +4   −2
lite/api/CMakeLists.txt                                          +36  −38
lite/backends/arm/math/concat.cc                                 +22  −21
lite/backends/arm/math/split.cc                                  +5   −3
lite/core/program.cc                                             +1   −0
lite/demo/cxx/README.md                                          +29  −0
lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7   +61  −0
lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8   +61  −0
lite/demo/cxx/mobile_classify/mobile_classify.cc                 +195 −0
lite/operators/split_op.cc                                       +9   −5
lite/utils/cv/CMakeLists.txt                                     +1   −1
CMakeLists.txt

@@ -73,7 +73,7 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
 lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
 lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
-lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
+lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
cmake/lite.cmake

@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(deps ${lite_deps_DEPS})

@@ -44,7 +44,7 @@ function (lite_deps TARGET)
       set(deps ${deps} ${var})
     endforeach(var)
   if (LITE_WITH_CV)
-    foreach(var ${lite_cv_deps})
+    foreach(var ${lite_deps_CV_DEPS})
       set(deps ${deps} ${var})
     endforeach(var)
   endif()

@@ -115,10 +115,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 # LIGHT_DEPS:           LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # HVY_DEPS:             NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None
+# CV_DEPS:              LITE_WITH_CV
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
       HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

@@ -129,6 +130,7 @@ function(lite_cc_library TARGET)
     CUDA_DEPS ${args_CUDA_DEPS}
     CL_DEPS ${args_CL_DEPS}
     ARM_DEPS ${args_ARM_DEPS}
+    CV_DEPS ${args_CV_DEPS}
     FPGA_DEPS ${args_FPGA_DEPS}
     NPU_DEPS ${args_NPU_DEPS}
     XPU_DEPS ${args_XPU_DEPS}

@@ -162,7 +164,7 @@ function(lite_cc_binary TARGET)
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(deps "")

@@ -178,6 +180,7 @@ function(lite_cc_binary TARGET)
     PROFILE_DEPS ${args_PROFILE_DEPS}
     LIGHT_DEPS ${args_LIGHT_DEPS}
     HVY_DEPS ${args_HVY_DEPS}
+    CV_DEPS ${CV_DEPS}
     )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
   target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)

@@ -208,7 +211,7 @@ function(lite_cc_test TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
+      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
       ARGS
       COMPILE_LEVEL # (basic|extra)
       )

@@ -232,6 +235,7 @@ function(lite_cc_test TARGET)
     PROFILE_DEPS ${args_PROFILE_DEPS}
     LIGHT_DEPS ${args_LIGHT_DEPS}
     HVY_DEPS ${args_HVY_DEPS}
+    CV_DEPS ${args_CV_DEPS}
     )
   _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
   # strip binary target to reduce size
lite/CMakeLists.txt

@@ -222,7 +222,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
         COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
         COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
         COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
         COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
+        COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+        COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
         )
 add_dependencies(publish_inference_android_cxx_demos logging gflags)
 add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)

@@ -236,7 +237,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
         COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
         COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
         COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
+        COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+        COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
         )
 add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
 endif()
lite/api/CMakeLists.txt

@@ -24,12 +24,16 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
   endif()
   if (LITE_WITH_CUDA)
     target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
   endif(LITE_WITH_CUDA)
 endif(LITE_WITH_CUDA)

 #light api dynamic library
 lite_cc_library(paddle_light_api_shared MODULE
-                SRCS light_api_shared.cc
-                DEPS ${light_lib_DEPS}
-                ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
+                SRCS light_api_shared.cc
+                DEPS ${light_lib_DEPS}
+                ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels})
+target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
 if (LITE_WITH_NPU)
   # Strips the symbols of our protobuf functions to fix the conflicts during

@@ -75,16 +79,17 @@ message(STATUS "get FPGA kernels ${fpga_kernels}")
 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
   set(cxx_api_deps
       scope optimizer target_wrapper_host model_parser program)
   lite_cc_library(cxx_api
-                  SRCS cxx_api.cc
-                  DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
-                  X86_DEPS ${x86_kernels}
-                  ARM_DEPS ${arm_kernels}
-                  NPU_DEPS ${npu_kernels}
-                  XPU_DEPS ${xpu_kernels}
-                  CL_DEPS ${opencl_kernels}
-                  FPGA_DEPS ${fpga_kernels})
+                  SRCS cxx_api.cc
+                  DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
+                  X86_DEPS ${x86_kernels}
+                  ARM_DEPS ${arm_kernels}
+                  CV_DEPS paddle_cv_arm
+                  NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
+                  XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+                  CL_DEPS ${opencl_kernels}
+                  FPGA_DEPS ${fpga_kernels})
 endif()

 # for light api

@@ -100,6 +105,7 @@ lite_cc_library(light_api SRCS light_api.cc
     CUDA_DEPS ${cuda_kernels}
     X86_DEPS ${x86_kernels}
     ARM_DEPS ${arm_kernels}
+    CV_DEPS paddle_cv_arm
     NPU_DEPS ${npu_kernels}
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}

@@ -224,11 +230,12 @@ else()
 endif()

 if (NOT LITE_ON_TINY_PUBLISH)
   lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light
-                  ${ops} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels})
+                  ${ops} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels})
   # The final inference library for just MobileConfig.
   bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
   get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)

@@ -258,7 +265,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
   add_subdirectory(android)
 endif()

 if (LITE_WITH_PYTHON)
   add_subdirectory(python)
 endif()

@@ -288,25 +295,16 @@ endif()
 # Some bins
 if (NOT IOS)
-  lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
-                 ${ops} ${host_kernels}
-                 ARM_DEPS ${arm_kernels}
-                 NPU_DEPS ${npu_kernels}
-                 XPU_DEPS ${xpu_kernels}
-                 CL_DEPS ${opencl_kernels}
-                 FPGA_DEPS ${fpga_kernels}
-                 X86_DEPS ${x86_kernels}
-                 CUDA_DEPS ${cuda_kernels})
-  lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
-                 ${ops} ${host_kernels}
-                 ARM_DEPS ${arm_kernels}
-                 NPU_DEPS ${npu_kernels}
-                 XPU_DEPS ${xpu_kernels}
-                 CL_DEPS ${opencl_kernels}
-                 FPGA_DEPS ${fpga_kernels}
-                 X86_DEPS ${x86_kernels}
-                 CUDA_DEPS ${cuda_kernels})
+  lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+                 ${ops} ${host_kernels}
+                 ARM_DEPS ${arm_kernels}
+                 CV_DEPS paddle_cv_arm
+                 NPU_DEPS ${npu_kernels}
+                 XPU_DEPS ${xpu_kernels}
+                 CL_DEPS ${opencl_kernels}
+                 FPGA_DEPS ${fpga_kernels}
+                 X86_DEPS ${x86_kernels}
+                 CUDA_DEPS ${cuda_kernels})
 endif()
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
lite/backends/arm/math/concat.cc

@@ -26,31 +26,32 @@ namespace math {
 void concat_func(const std::vector<lite::Tensor*>& input,
                  const int axis,
                  lite::Tensor* output) {
-  size_t num = input.size();
-  int rows = 1;
+  int64_t concat_input_size = 1;
+  int64_t num_cancats = 1;
   auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
+  size_t num = input.size();
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
   }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int64_t> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
+  for (int i = 0; i < axis; i++) {
+    num_cancats *= dim_0[i];
   }
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float* dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float* src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
+
+  float* dst_ptr = output->mutable_data<float>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n]->dims();
+    const float* src_ptr = input[n]->data<float>();
+    int64_t in_concat_axis = dims[axis];
+    float* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_cancats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
     }
+    offset_concat_axis += in_concat_axis;
   }
 }
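The rewrite above replaces the old row-by-row gather (one memcpy of col_len floats per input per output row) with axis-blocked copies: each input contributes num_cancats contiguous runs of in_concat_axis * concat_input_size floats, written into the output at a stride of out_concat_axis * concat_input_size, so far fewer and larger memcpy calls are issued. A minimal standalone sketch of that copy scheme on raw float buffers (function and parameter names here are illustrative, not from the patch):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: concatenate `inputs` along one axis. Each input n is viewed as
// `num_slices` contiguous runs of `in_axis[n] * inner` floats; the runs are
// interleaved into `out` at a stride of `sum(in_axis) * inner`.
void concat_along_axis(const std::vector<const float*>& inputs,
                       const std::vector<int64_t>& in_axis,  // axis extent per input
                       int64_t num_slices,  // product of dims before the axis
                       int64_t inner,       // product of dims after the axis
                       float* out) {
  int64_t out_axis = 0;
  for (int64_t a : in_axis) out_axis += a;
  const int64_t out_stride = out_axis * inner;  // `out_sum` in the patch
  int64_t offset = 0;                           // `offset_concat_axis`
  for (size_t n = 0; n < inputs.size(); ++n) {
    const float* src = inputs[n];
    float* dst = out + offset * inner;
    const int64_t run = in_axis[n] * inner;     // `in_sum`: floats per memcpy
    for (int64_t i = 0; i < num_slices; ++i) {
      std::memcpy(dst, src, sizeof(float) * run);
      dst += out_stride;
      src += run;
    }
    offset += in_axis[n];
  }
}
```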
lite/backends/arm/math/split.cc

@@ -70,10 +70,12 @@ void split<float>(const float* din,
     int in_after = in_strides[axis];
     int out_after = out_strides[axis];
+    const float* din_ptr = din + input_offset;
     for (int i = 0; i < before; ++i) {
-      split_cpy(din + input_offset + i * in_after,
-                out_data + i * out_after,
-                out_after);
+      std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
+      din_ptr += in_after;
+      out_data += out_after;
     }
     input_offset += out_strides[axis];
   }
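For split<float> the same idea applies in reverse: the per-element split_cpy call is replaced by one std::memcpy per contiguous out_after run, advancing the source pointer by in_after floats between runs. A reduced sketch of just that inner loop (the helper name is hypothetical):

```cpp
#include <cstring>

// Sketch: copy `before` contiguous runs of `out_after` floats from `din`
// into `dout`, stepping the source by `in_after` floats per run.
void split_slice(const float* din, float* dout,
                 int before, int in_after, int out_after) {
  const float* din_ptr = din;
  for (int i = 0; i < before; ++i) {
    std::memcpy(dout, din_ptr, sizeof(float) * out_after);
    din_ptr += in_after;
    dout += out_after;
  }
}
```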
lite/core/program.cc

@@ -262,6 +262,7 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
   // VLOG(4) << "kernel launch";
   op_->InferShape();
   // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
lite/demo/cxx/README.md

@@ -60,3 +60,32 @@ adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_
adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
```
On success, the generated detection result image test_detection_result.jpg appears in the mobile_detection directory.

8. Build and run the image classification demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make

adb -s emulator-5554 push mobile_classify /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
On success, the predicted probabilities of the top-5 classes are printed to the console.
- To see the predicted probabilities of the top 10 classes instead, append a topk value to the command, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- To get classification results from another model, append its model_dir and the model's input size, e.g.:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 (new file, mode 100644)

ARM_ABI = arm7
export ARM_ABI

include ../Makefile.def

LITE_ROOT = ../../../
THIRD_PARTY_DIR = ${LITE_ROOT}/third_party

OPENCV_VERSION = opencv4.1.0

OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include

CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)

###############################################################
# How to use one of the static libraries:                     #
#   `libpaddle_api_full_bundled.a`                            #
#   `libpaddle_api_light_bundled.a`                           #
###############################################################
# Note: default use lite's shared library.                    #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)

mobile_classify: fetch_opencv mobile_classify.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)

mobile_classify.o: mobile_classify.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc

fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

.PHONY: clean
clean:
	rm -f mobile_classify.o
	rm -f mobile_classify
lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 (new file, mode 100644)

ARM_ABI = arm8
export ARM_ABI

include ../Makefile.def

LITE_ROOT = ../../../
THIRD_PARTY_DIR = ${LITE_ROOT}/third_party

OPENCV_VERSION = opencv4.1.0

OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a

OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include

CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include

CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)

###############################################################
# How to use one of the static libraries:                     #
#   `libpaddle_api_full_bundled.a`                            #
#   `libpaddle_api_light_bundled.a`                           #
###############################################################
# Note: default use lite's shared library.                    #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)

mobile_classify: fetch_opencv mobile_classify.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)

mobile_classify.o: mobile_classify.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc

fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}

.PHONY: clean
clean:
	rm -f mobile_classify.o
	rm -f mobile_classify
lite/demo/cxx/mobile_classify/mobile_classify.cc (new file, mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api;  // NOLINT

void load_labels(std::string path, std::vector<std::string>* labels) {
  FILE* fp = fopen(path.c_str(), "r");
  if (fp == nullptr) {
    printf("load label file failed\n");
    return;
  }
  while (!feof(fp)) {
    char str[1024];
    fgets(str, 1024, fp);
    std::string str_s(str);
    if (str_s.length() > 0) {
      for (int i = 0; i < str_s.length(); i++) {
        if (str_s[i] == ' ') {
          std::string strr = str_s.substr(i, str_s.length() - i - 1);
          labels->push_back(strr);
          i = str_s.length();
        }
      }
    }
  }
  fclose(fp);
}

void print_topk(const float* scores,
                const int size,
                const int topk,
                const std::vector<std::string>& labels) {
  std::vector<std::pair<float, int>> vec;
  vec.resize(size);
  for (int i = 0; i < size; i++) {
    vec[i] = std::make_pair(scores[i], i);
  }
  std::partial_sort(vec.begin(),
                    vec.begin() + topk,
                    vec.end(),
                    std::greater<std::pair<float, int>>());

  // print topk and score
  for (int i = 0; i < topk; i++) {
    float score = vec[i].first;
    int index = vec[i].second;
    printf("i: %d, index: %d, name: %s, score: %f\n",
           i,
           index,
           labels[index].c_str(),
           score);
  }
}

// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale) {
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);

  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;

  int i = 0;
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);

    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) * scale[0];
    *(dout_c0++) = (*(din++) - mean[1]) * scale[1];
    *(dout_c0++) = (*(din++) - mean[2]) * scale[2];
  }
}

void pre_process(const cv::Mat& img,
                 int width,
                 int height,
                 Tensor dstTensor,
                 float* means,
                 float* scales) {
  cv::Mat rgb_img;
  // cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
  cv::Mat imgf;
  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
  const float* dimg = reinterpret_cast<const float*>(imgf.data);
  float* data = dstTensor.mutable_data<float>();
  neon_mean_scale(dimg, data, width * height, means, scales);
}

void RunModel(std::string model_dir,
              std::string img_path,
              const std::vector<std::string>& labels,
              const int topk,
              int width,
              int height) {
  // 1. Set MobileConfig
  MobileConfig config;
  config.set_model_dir(model_dir);

  // 2. Create PaddlePredictor by MobileConfig
  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  // 3. Prepare input data from image
  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
  input_tensor->Resize({1, 3, height, width});
  auto* data = input_tensor->mutable_data<float>();
  // read img and pre-process
  cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
  // pre_process(img, width, height, data);
  float means[3] = {0.485f, 0.456f, 0.406f};
  float scales[3] = {0.229f, 0.224f, 0.225f};
  pre_process(img, width, height, *input_tensor, means, scales);

  // 4. Run predictor
  predictor->Run();

  // 5. Get output and post process
  std::unique_ptr<const Tensor> output_tensor(
      std::move(predictor->GetOutput(0)));
  auto* outptr = output_tensor->data<float>();
  auto shape_out = output_tensor->shape();
  int64_t cnt = 1;
  for (auto& i : shape_out) {
    cnt *= i;
  }
  print_topk(outptr, cnt, topk, labels);
}

int main(int argc, char** argv) {
  if (argc < 4) {
    std::cerr << "[ERROR] usage: " << argv[0]
              << " model_dir image_path label_file\n";
    exit(1);
  }
  printf("parameter: model_dir, image_path and label_file are necessary\n");
  printf("parameter: topk, input_width, input_height, are optional\n");
  std::string model_dir = argv[1];
  std::string img_path = argv[2];
  std::string label_file = argv[3];
  std::vector<std::string> labels;
  load_labels(label_file, &labels);
  int topk = 5;
  int height = 224;
  int width = 224;
  if (argc > 4) {
    topk = atoi(argv[4]);
  }
  if (argc > 6) {
    width = atoi(argv[5]);
    height = atoi(argv[6]);
  }

  RunModel(model_dir, img_path, labels, topk, width, height);
  return 0;
}
lite/operators/split_op.cc

@@ -89,16 +89,20 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (std::find(input_arg_names.begin(),
                 input_arg_names.end(),
                 "AxisTensor") != input_arg_names.end()) {
     auto args = opdesc.Input("AxisTensor");
-    auto *var = scope->FindVar(args.front());
-    param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    }
   }
   if (std::find(input_arg_names.begin(),
                 input_arg_names.end(),
                 "SectionsTensorList") != input_arg_names.end()) {
     auto args = opdesc.Input("SectionsTensorList");
-    auto *var = scope->FindVar(args.front());
-    param_.sections_tensor_list =
-        *(var->GetMutable<std::vector<lite::Tensor *>>());
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.sections_tensor_list =
+          *(var->GetMutable<std::vector<lite::Tensor *>>());
+    }
   }
   return true;
 }
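The split_op.cc half of the model_optimize fix is just the !args.empty() guard: an op description can declare the optional AxisTensor or SectionsTensorList input without binding any variable to it, and calling args.front() on that empty argument list is the failure the guard avoids. The defensive pattern, sketched generically (hypothetical names, not the Paddle-Lite API):

```cpp
#include <string>
#include <vector>

// Sketch: an optional op input arrives as a possibly-empty list of variable
// names; only dereference front() once the list is known to be non-empty.
const std::string* optional_input_name(const std::vector<std::string>& args) {
  if (!args.empty()) {
    return &args.front();
  }
  return nullptr;  // declared in the op description but never bound
}
```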
lite/utils/cv/CMakeLists.txt

@@ -7,5 +7,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_
                 image_flip.cc
                 image_rotate.cc
                 image_resize.cc
-                DEPS ${lite_cv_deps} paddle_api_light)
+                DEPS ${lite_cv_deps} paddle_api place)
 endif()