diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77a94bea1efcdafaa67b4c078bfb0a756f7b1cec..0c71a45ffc4c6676238f5975513eefc70f4d702f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
 lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
 lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
 # cv build options
-lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
+lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
 
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index d6b374529e27119f1c48c03c667aa694481e45e8..d01e2d67edd04cfbccd8def62133f301cb694fef 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps ${lite_deps_DEPS})
@@ -44,7 +44,7 @@ function (lite_deps TARGET)
       set(deps ${deps} ${var})
     endforeach(var)
     if(LITE_WITH_CV)
-      foreach(var ${lite_cv_deps})
+      foreach(var ${lite_deps_CV_DEPS})
        set(deps ${deps} ${var})
       endforeach(var)
     endif()
@@ -115,10 +115,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 # LIGHT_DEPS:  LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # HVY_DEPS:    NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 # EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None
+# CV_DEPS:     LITE_WITH_CV
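+#   e.g. lite_cc_library(foo SRCS foo.cc CV_DEPS paddle_cv_arm) links
+#   paddle_cv_arm into foo only when LITE_WITH_CV is ON (foo is illustrative).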
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
       HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
@@ -129,6 +130,7 @@ function(lite_cc_library TARGET)
       CUDA_DEPS ${args_CUDA_DEPS}
       CL_DEPS ${args_CL_DEPS}
       ARM_DEPS ${args_ARM_DEPS}
+      CV_DEPS ${args_CV_DEPS}
       FPGA_DEPS ${args_FPGA_DEPS}
       NPU_DEPS ${args_NPU_DEPS}
       XPU_DEPS ${args_XPU_DEPS}
@@ -162,7 +164,7 @@ function(lite_cc_binary TARGET)
   endif()
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
+      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
   set(deps "")
@@ -178,6 +180,7 @@ function(lite_cc_binary TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
+      CV_DEPS ${args_CV_DEPS}
       )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
   target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -208,7 +211,7 @@ function(lite_cc_test TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
-      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
+      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
       ARGS
       COMPILE_LEVEL # (basic|extra)
       )
@@ -232,6 +235,7 @@ function(lite_cc_test TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
+      CV_DEPS ${args_CV_DEPS}
       )
   _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
   # strip binary target to reduce size
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index df6b7d3648409e13d88c049ec86173905f8b3cb6..07d78320db81f93220f353fbc4eca1d9ae135d04 100644
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -222,7 +222,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
         )
         add_dependencies(publish_inference_android_cxx_demos logging gflags)
         add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
@@ -236,7 +237,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-
+            COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
         )
         add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
     endif()
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 70239e94e7a3064fb383246623d05a2079dda1fa..2264d0052ef296052dcd4a55c1ae90b8c4551b26 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -24,12 +24,16 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
     endif()
     if(LITE_WITH_CUDA)
         target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
-    endif(LITE_WITH_CUDA)
+    endif(LITE_WITH_CUDA)
+
     #light api dynamic library
     lite_cc_library(paddle_light_api_shared MODULE
-            SRCS light_api_shared.cc
-            DEPS ${light_lib_DEPS}
-            ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
+            SRCS light_api_shared.cc
+            DEPS ${light_lib_DEPS}
+            ARM_DEPS ${arm_kernels}
+            CV_DEPS paddle_cv_arm
+            NPU_DEPS ${npu_kernels})
+    target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
     if (LITE_WITH_NPU)
         # Strips the symbols of our protobuf functions to fix the conflicts during
@@ -75,16 +79,17 @@ message(STATUS "get FPGA kernels ${fpga_kernels}")
 
 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
     set(cxx_api_deps
-        scope optimizer target_wrapper_host model_parser program)
+        scope optimizer target_wrapper_host model_parser program)
     lite_cc_library(cxx_api
-        SRCS cxx_api.cc
-        DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
-        X86_DEPS ${x86_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        SRCS cxx_api.cc
+        DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
+        X86_DEPS ${x86_kernels}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
+        XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels})
 endif()
 
 # for light api
@@ -100,6 +105,7 @@ lite_cc_library(light_api SRCS light_api.cc
           CUDA_DEPS ${cuda_kernels}
           X86_DEPS ${x86_kernels}
           ARM_DEPS ${arm_kernels}
+          CV_DEPS paddle_cv_arm
           NPU_DEPS ${npu_kernels}
           XPU_DEPS ${xpu_kernels}
           CL_DEPS ${opencl_kernels}
@@ -224,11 +230,12 @@ else()
 endif()
 if (NOT LITE_ON_TINY_PUBLISH)
     lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light
-            ${ops}
-            ARM_DEPS ${arm_kernels}
-            NPU_DEPS ${npu_kernels}
-            CL_DEPS ${opencl_kernels}
-            FPGA_DEPS ${fpga_kernels})
+            ${ops}
+            ARM_DEPS ${arm_kernels}
+            CV_DEPS paddle_cv_arm
+            NPU_DEPS ${npu_kernels}
+            CL_DEPS ${opencl_kernels}
+            FPGA_DEPS ${fpga_kernels})
     # The final inference library for just MobileConfig.
     bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -258,7 +265,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
     add_subdirectory(android)
 endif()
 
-if (LITE_WITH_PYTHON)
+if (LITE_WITH_PYTHON)
     add_subdirectory(python)
 endif()
 
@@ -288,25 +295,16 @@ endif()
 
 # Some bins
 if(NOT IOS)
-    lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
-        ${ops} ${host_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
-    lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
-        ${ops} ${host_kernels}
-        ARM_DEPS ${arm_kernels}
-        NPU_DEPS ${npu_kernels}
-        XPU_DEPS ${xpu_kernels}
-        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels}
-        X86_DEPS ${x86_kernels}
-        CUDA_DEPS ${cuda_kernels})
-
+    lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+        ${ops} ${host_kernels}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        XPU_DEPS ${xpu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        X86_DEPS ${x86_kernels}
+        CUDA_DEPS ${cuda_kernels})
 endif()
 
 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc
index 9b94cefa16bca0dd487ad0e4f6b88e604b694416..65f93453388d7f41d73669f583d189bec9035bb5 100644
--- a/lite/backends/arm/math/concat.cc
+++ b/lite/backends/arm/math/concat.cc
@@ -26,31 +26,32 @@ namespace math {
 void concat_func(const std::vector<lite::Tensor *> &input,
                  const int axis,
                  lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
+  int64_t concat_input_size = 1;
+  int64_t num_concats = 1;
   auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
+  size_t num = input.size();
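+  // Treat every tensor as (num_concats, dims[axis], concat_input_size):
+  // num_concats collapses the dims before `axis`, concat_input_size the
+  // dims after it, so each memcpy below moves one contiguous slice.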
+  for (int i = axis + 1; i < dim_0.size(); i++) {
+    concat_input_size *= dim_0[i];
   }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
+  for (int i = 0; i < axis; i++) {
+    num_concats *= dim_0[i];
   }
-
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
+  float *dst_ptr = output->mutable_data<float>();
+  const int out_concat_axis = output->dims()[axis];
+  int64_t offset_concat_axis = 0;
+  int64_t out_sum = out_concat_axis * concat_input_size;
+  for (int n = 0; n < num; n++) {
+    auto dims = input[n]->dims();
+    const float *src_ptr = input[n]->data<float>();
+    int64_t in_concat_axis = dims[axis];
+    float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
+    int64_t in_sum = in_concat_axis * concat_input_size;
+    for (int i = 0; i < num_concats; i++) {
+      std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
+      dout_ptr += out_sum;
+      src_ptr += in_sum;
     }
+    offset_concat_axis += in_concat_axis;
   }
 }
diff --git a/lite/backends/arm/math/split.cc b/lite/backends/arm/math/split.cc
index 54ea7e62c2567cf2fe490351572968366fda483e..bff29af93b525dc18e19bded03b0770f7f7a33c8 100644
--- a/lite/backends/arm/math/split.cc
+++ b/lite/backends/arm/math/split.cc
@@ -70,10 +70,12 @@ void split(const float* din,
     int in_after = in_strides[axis];
     int out_after = out_strides[axis];
 
+    const float* din_ptr = din + input_offset;
+
     for (int i = 0; i < before; ++i) {
-      split_cpy(din + input_offset + i * in_after,
-                out_data + i * out_after,
-                out_after);
+      std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
+      din_ptr += in_after;
+      out_data += out_after;
     }
     input_offset += out_strides[axis];
   }
diff --git a/lite/core/program.cc b/lite/core/program.cc
index b0c61bf00ed29e2fa71072b64f11f6ba30f77691..8dc8fb0dddc54d7d83b2368b31b5f30725469296 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -262,6 +262,7 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
+  // VLOG(4) << "kernel launch";
   op_->InferShape();
   // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md
index b7768d763eb4f6d2255119f805753f96d4bef9e6..5e0ec49adda2c6f7372bdbba1fdd04b610b0a0bc 100644
--- a/lite/demo/cxx/README.md
+++ b/lite/demo/cxx/README.md
@@ -60,3 +60,32 @@ adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_
 adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
 ```
 On success, the generated detection image test_detection_result.jpg appears in the mobile_detection directory.
+
+8. Build and run the image classification demo
+```shell
+cd ../mobile_classify
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+make
+adb -s emulator-5554 push mobile_classify /data/local/tmp/
+adb -s emulator-5554 push test.jpg /data/local/tmp/
+adb -s emulator-5554 push labels.txt /data/local/tmp/
+adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
+adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
+```
+On success, the predicted probabilities of the top-5 classes are printed to the console.
+- To print the top-10 classes instead, append a topk value to the command, e.g.:
+  ```shell
+  adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
+  ```
+- To classify with another model, pass its model_dir and the model's input size, e.g.:
+  ```shell
+  adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
+  /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
+  ```
+
diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7
new file mode 100644
index 0000000000000000000000000000000000000000..8d446af9b174d8876fdd9aafd64bc2057dd7e17e
--- /dev/null
+++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7
@@ -0,0 +1,61 @@
+ARM_ABI = arm7
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of the static libraries:                     #
+#   `libpaddle_api_full_bundled.a`                            #
+#   `libpaddle_api_light_bundled.a`                           #
+###############################################################
+# Note: the shared library is used by default.                #
+###############################################################
+# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
+# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
+
+#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobile_classify: fetch_opencv mobile_classify.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
+
+mobile_classify.o: mobile_classify.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
+
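+# fetch_opencv downloads the prebuilt OpenCV archive into third_party on
+# first use and unpacks it; later builds see the directory and skip both steps.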
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	(echo "fetch opencv libs" && \
+	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean
+clean:
+	rm -f mobile_classify.o
+	rm -f mobile_classify
diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8
new file mode 100644
index 0000000000000000000000000000000000000000..255c42f2dca5364d9a639c993737608657568b17
--- /dev/null
+++ b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8
@@ -0,0 +1,61 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+THIRD_PARTY_DIR=${LITE_ROOT}/third_party
+
+OPENCV_VERSION=opencv4.1.0
+
+OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
+              ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
+
+OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
+
+CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
+
+###############################################################
+# How to use one of the static libraries:                     #
+#   `libpaddle_api_full_bundled.a`                            #
+#   `libpaddle_api_light_bundled.a`                           #
+###############################################################
+# Note: the shared library is used by default.                #
+###############################################################
+# 1. Comment out the line above that uses `libpaddle_light_api_shared.so`
+# 2. Uncomment the line below that uses `libpaddle_api_light_bundled.a`
+
+#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobile_classify: fetch_opencv mobile_classify.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_classify.o -o mobile_classify $(CXX_LIBS) $(LDFLAGS)
+
+mobile_classify.o: mobile_classify.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_classify.o -c mobile_classify.cc
+
+fetch_opencv:
+	@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
+	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
+	(echo "fetch opencv libs" && \
+	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
+	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
+	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
+
+
+.PHONY: clean
+clean:
+	rm -f mobile_classify.o
+	rm -f mobile_classify
diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c651bf9f4cca0db0e126311e5a03b3ade6ccf886
--- /dev/null
+++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
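+
+// Demo: image classification with Paddle-Lite's light-weight MobileConfig API.
+// Usage: mobile_classify <model_dir> <image_path> <label_file> [topk] [width] [height]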
+
+#include <arm_neon.h>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include "paddle_api.h"  // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+void load_labels(std::string path, std::vector<std::string>* labels) {
+  FILE* fp = fopen(path.c_str(), "r");
+  if (fp == nullptr) {
+    printf("load label file failed \n");
+    return;
+  }
+  char str[1024];
+  while (fgets(str, sizeof(str), fp) != nullptr) {
+    std::string str_s(str);
+    if (str_s.length() > 0) {
+      for (int i = 0; i < str_s.length(); i++) {
+        if (str_s[i] == ' ') {
+          // keep everything after the first space, minus the trailing newline
+          std::string strr = str_s.substr(i, str_s.length() - i - 1);
+          labels->push_back(strr);
+          i = str_s.length();
+        }
+      }
+    }
+  }
+  fclose(fp);
+}
+
+void print_topk(const float* scores,
+                const int size,
+                const int topk,
+                const std::vector<std::string>& labels) {
+  std::vector<std::pair<float, int>> vec;
+  vec.resize(size);
+  for (int i = 0; i < size; i++) {
+    vec[i] = std::make_pair(scores[i], i);
+  }
+
+  std::partial_sort(vec.begin(),
+                    vec.begin() + topk,
+                    vec.end(),
+                    std::greater<std::pair<float, int>>());
+
+  // print topk and score
+  for (int i = 0; i < topk; i++) {
+    float score = vec[i].first;
+    int index = vec[i].second;
+    printf("i: %d, index: %d, name: %s, score: %f \n",
+           i,
+           index,
+           labels[index].c_str(),
+           score);
+  }
+}
+
+// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
+void neon_mean_scale(
+    const float* din, float* dout, int size, float* mean, float* scale) {
+  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
+  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
+  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
+  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
+  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
+  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
+
+  float* dout_c0 = dout;
+  float* dout_c1 = dout + size;
+  float* dout_c2 = dout + size * 2;
+
+  int i = 0;
+  for (; i < size - 3; i += 4) {
+    // deinterleave 4 RGB pixels, then (x - mean) / scale per channel
+    float32x4x3_t vin3 = vld3q_f32(din);
+    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
+    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
+    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
+    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
+    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
+    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
+    vst1q_f32(dout_c0, vs0);
+    vst1q_f32(dout_c1, vs1);
+    vst1q_f32(dout_c2, vs2);
+
+    din += 12;
+    dout_c0 += 4;
+    dout_c1 += 4;
+    dout_c2 += 4;
+  }
+  // scalar tail: same normalization, one pixel at a time
+  for (; i < size; i++) {
+    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
+    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
+    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
+  }
+}
+
+void pre_process(const cv::Mat& img,
+                 int width,
+                 int height,
+                 Tensor dstTensor,
+                 float* means,
+                 float* scales) {
+  cv::Mat rgb_img;
+  cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
+  cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
+  cv::Mat imgf;
+  rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
+  const float* dimg = reinterpret_cast<const float*>(imgf.data);
+  float* data = dstTensor.mutable_data<float>();
+  neon_mean_scale(dimg, data, width * height, means, scales);
+}
+
+void RunModel(std::string model_dir,
+              std::string img_path,
+              const std::vector<std::string>& labels,
+              const int topk,
+              int width,
+              int height) {
+  // 1. Set MobileConfig
+  MobileConfig config;
+  config.set_model_dir(model_dir);
+
+  // 2. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
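+
+  // The image is resized to width x height, scaled to [0, 1], then
+  // normalized with per-channel mean/std while converting HWC to CHW.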
+  // 3. Prepare input data from image
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, height, width});
+  // read img and pre-process
+  cv::Mat img = cv::imread(img_path, cv::IMREAD_COLOR);
+  float means[3] = {0.485f, 0.456f, 0.406f};
+  float scales[3] = {0.229f, 0.224f, 0.225f};
+  pre_process(img, width, height, *input_tensor, means, scales);
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output and post process
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  auto* outptr = output_tensor->data<float>();
+  auto shape_out = output_tensor->shape();
+  int64_t cnt = 1;
+  for (auto& i : shape_out) {
+    cnt *= i;
+  }
+  print_topk(outptr, cnt, topk, labels);
+}
+
+int main(int argc, char** argv) {
+  if (argc < 4) {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " model_dir image_path label_file\n";
+    printf("parameter: model_dir, image_path and label_file are necessary \n");
+    printf("parameter: topk, input_width, input_height, are optional \n");
+    exit(1);
+  }
+  std::string model_dir = argv[1];
+  std::string img_path = argv[2];
+  std::string label_file = argv[3];
+  std::vector<std::string> labels;
+  load_labels(label_file, &labels);
+  int topk = 5;
+  int height = 224;
+  int width = 224;
+  if (argc > 4) {
+    topk = atoi(argv[4]);
+  }
+  if (argc > 6) {
+    width = atoi(argv[5]);
+    height = atoi(argv[6]);
+  }
+
+  RunModel(model_dir, img_path, labels, topk, width, height);
+  return 0;
+}
diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc
index ec98a0d6c3ba3b1e5cd1c7992b58e96917d21057..834d68a3156700605e621a1ba71faec33fb7b745 100644
--- a/lite/operators/split_op.cc
+++ b/lite/operators/split_op.cc
@@ -89,16 +89,20 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") !=
       input_arg_names.end()) {
     auto args = opdesc.Input("AxisTensor");
-    auto *var = scope->FindVar(args.front());
-    param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.axis_tensor = var->GetMutable<lite::Tensor>();
+    }
   }
   if (std::find(input_arg_names.begin(),
                 input_arg_names.end(),
                 "SectionsTensorList") != input_arg_names.end()) {
     auto args = opdesc.Input("SectionsTensorList");
-    auto *var = scope->FindVar(args.front());
-    param_.sections_tensor_list =
-        *(var->GetMutable<std::vector<lite::Tensor>>());
+    if (!args.empty()) {
+      auto *var = scope->FindVar(args.front());
+      param_.sections_tensor_list =
+          *(var->GetMutable<std::vector<lite::Tensor>>());
+    }
   }
   return true;
 }
diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt
index 01f5341c972342afa13fabaf5183a7d5d8543c7f..0edcb2ef24ce4f53ffffa14ad70cbbc1d5c5971e 100644
--- a/lite/utils/cv/CMakeLists.txt
+++ b/lite/utils/cv/CMakeLists.txt
@@ -7,5 +7,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_
         image_flip.cc
         image_rotate.cc
         image_resize.cc
-        DEPS ${lite_cv_deps} paddle_api_light)
+        DEPS ${lite_cv_deps} paddle_api place)
 endif()