Commit 04a36e78 authored by baolei.an

* adjust follow npu/xpu

* fix code_style test=develop
Parent d6709eb9
@@ -74,7 +74,7 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
-lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
+lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
@@ -170,6 +170,10 @@ endif()
########################################################################################
+if(LITE_WITH_XPU)
+include(xpu)
+endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
@@ -189,14 +193,9 @@ if(LITE_WITH_CUDA)
include(cuda)
endif()
-if(LITE_WITH_XPU)
-include(xpu)
-endif()
if(LITE_WITH_BM)
include(bm)
endif()
include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
......
@@ -34,7 +34,7 @@ Paddle Lite is an upgraded version of Paddle-Mobile, positioned to support platforms including mobile
PaddleLite's architecture is designed around support for multiple hardware backends and platforms. It strengthens the ability to run a single model across several hardware backends, applies performance optimizations at multiple levels, and keeps on-device deployment lightweight.
-![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)
+![](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
The Analysis Phase contains the MIR (Machine IR) modules, which apply optimizations such as operator fusion and computation pruning to the original model's computation graph for a given list of target hardware. The Execution Phase only involves kernel execution; it can be deployed on its own to support extremely lightweight deployment.
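In terms of the public C++ API, the Analysis Phase is driven through CxxConfig (optimize and save a model) and the Execution Phase through MobileConfig (run the optimized model on device). The sketch below is illustrative only; it mirrors the CxxConfig/MobileConfig calls used elsewhere in this commit, and the model paths are hypothetical.

```cpp
#include "lite/api/paddle_api.h"  // CxxConfig, MobileConfig, CreatePaddlePredictor

// Analysis Phase: run the MIR passes for the chosen hardware list and persist
// an optimized naive-buffer model (done offline, on the host).
void BuildOptimizedModel() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("mobilenet_v1");  // hypothetical model directory
  config.set_valid_places({paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  predictor->SaveOptimizedModel("mobilenet_v1_opt",
                                paddle::lite_api::LiteModelType::kNaiveBuffer);
}

// Execution Phase: the light-weight predictor only executes kernels, so it can
// be deployed without any of the analysis modules.
void RunOptimizedModel() {
  paddle::lite_api::MobileConfig config;
  config.set_model_dir("mobilenet_v1_opt");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.f;
  predictor->Run();
  const float* out = predictor->GetOutput(0)->data<float>();
  (void)out;
}
```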
......
@@ -63,7 +63,7 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -fvisibility=hidden -fvisibility-inlines-hidden -fdata-sections -ffunction-sections")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-${CUDA_TOOLKIT_ROOT_DIR}/lib64
-)
+${CUDA_TOOLKIT_ROOT_DIR}/lib64)
if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
......
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
-set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS ARGS)
+set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
@@ -44,7 +44,7 @@ function (lite_deps TARGET)
set(deps ${deps} ${var})
endforeach(var)
if(LITE_WITH_CV)
-foreach(var ${lite_cv_deps})
+foreach(var ${lite_deps_CV_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
@@ -121,10 +121,11 @@ file(WRITE ${offline_lib_registry_file} "") # clean
# LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
# HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
# EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None
+# CV_DEPS: LITE_WITH_CV
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
-set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS LIGHT_DEPS
+set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -134,11 +135,12 @@ function(lite_cc_library TARGET)
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
-NPU_DEPS ${args_NPU_DEPS}
-XPU_DEPS ${args_XPU_DEPS}
+BM_DEPS ${args_BM_DEPS}
ARM_DEPS ${args_ARM_DEPS}
+CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
-BM_DEPS ${args_BM_DEPS}
+NPU_DEPS ${args_NPU_DEPS}
+XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
@@ -168,8 +170,8 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
-set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
-LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
+set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
@@ -180,10 +182,13 @@ function(lite_cc_binary TARGET)
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
-BM_DEPS ${args_BM_DEPS}
+NPU_DEPS ${args_NPU_DEPS}
+XPU_DEPS ${args_XPU_DEPS}
+BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
+CV_DEPS ${CV_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -213,8 +218,8 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
-set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
-LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
+set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
)
@@ -233,10 +238,13 @@ function(lite_cc_test TARGET)
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
-BM_DEPS ${args_BM_DEPS}
+NPU_DEPS ${args_NPU_DEPS}
+XPU_DEPS ${args_XPU_DEPS}
+BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
+CV_DEPS ${args_CV_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
@@ -277,7 +285,7 @@ endif()
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
-set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS
+set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -376,12 +384,13 @@ function(add_kernel TARGET device level)
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
-XPU_DEPS ${args_XPU_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
-BM_DEPS ${args_BM_DEPS}
+NPU_DEPS ${args_NPU_DEPS}
+XPU_DEPS ${args_XPU_DEPS}
+BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
@@ -400,7 +409,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
-set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS BM_DEPS FPGA_DEPS PROFILE_DEPS
+set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -426,12 +435,13 @@ function(add_operator TARGET level)
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
-XPU_DEPS ${args_XPU_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
-BM_DEPS ${args_BM_DEPS}
+NPU_DEPS ${args_NPU_DEPS}
+XPU_DEPS ${args_XPU_DEPS}
+BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
@@ -99,7 +99,7 @@ else()
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
@@ -176,13 +176,17 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
+COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
+add_dependencies(tiny_publish_cxx_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_cxx_lib)
-add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
-COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
+if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
+COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
+endif()
endif()
endif()
endif()
@@ -222,7 +226,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
-COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
+COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
@@ -236,7 +241,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
+COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
+COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
@@ -16,17 +16,24 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
+if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
+add_dependencies(paddle_full_api_shared dynload_mklml)
+endif()
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
DEPS ${light_lib_DEPS}
-ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
+ARM_DEPS ${arm_kernels}
+CV_DEPS paddle_cv_arm
+NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
@@ -38,10 +45,11 @@ else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
+set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
-target_link_libraries(paddle_light_api_shared ${npu_runtime_libs})
+target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
endif()
endif()
@@ -73,22 +81,22 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
+message(STATUS "get BM kernels ${bm_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
set(cxx_api_deps
scope optimizer target_wrapper_host model_parser program)
lite_cc_library(cxx_api
SRCS cxx_api.cc
DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
-NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
-XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
-BM_DEPS ${bm_kernels} ${bm_bridges} bm_pass
+CV_DEPS paddle_cv_arm
+NPU_DEPS ${npu_kernels}
+XPU_DEPS ${xpu_kernels}
+BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
endif()
# for light api
@@ -104,6 +112,7 @@ lite_cc_library(light_api SRCS light_api.cc
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
+CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
@@ -234,11 +243,12 @@ else()
endif()
if (NOT LITE_ON_TINY_PUBLISH)
lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light
${ops}
ARM_DEPS ${arm_kernels}
+CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -270,7 +280,7 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
add_subdirectory(android)
endif()
if (LITE_WITH_PYTHON)
add_subdirectory(python)
endif()
@@ -301,27 +311,38 @@ endif()
# Some bins
if(NOT IOS)
lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
+CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
-BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
+lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+${ops} ${host_kernels}
+ARM_DEPS ${arm_kernels}
+CV_DEPS paddle_cv_arm
+NPU_DEPS ${npu_kernels}
+XPU_DEPS ${xpu_kernels}
+CL_DEPS ${opencl_kernels}
+BM_DEPS ${bm_kernels}
+FPGA_DEPS ${fpga_kernels}
+X86_DEPS ${x86_kernels}
+CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
@@ -108,7 +108,7 @@ USE_LITE_OP(while)
USE_LITE_OP(lod_reset)
USE_LITE_OP(lookup_table)
USE_LITE_OP(multiclass_nms)
-USE_LITE_OP(graph_op)
+USE_LITE_OP(subgraph)
USE_LITE_OP(sequence_expand)
USE_LITE_OP(sequence_pool)
USE_LITE_OP(reduce_max)
......
@@ -25,11 +25,12 @@ if (NOT LITE_ON_TINY_PUBLISH)
endif()
else()
add_library(paddle_lite_jni SHARED "")
+set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
-target_link_libraries(paddle_lite_jni ${npu_runtime_libs})
+target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
endif()
endif()
......
@@ -120,6 +120,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
return JNI_TRUE;
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
JNIEnv *env, jobject jtensor, jintArray buf) {
std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
if (tensor == nullptr || (*tensor == nullptr)) {
return JNI_FALSE;
}
int64_t buf_size = (int64_t)env->GetArrayLength(buf);
if (buf_size != product((*tensor)->shape())) {
return JNI_FALSE;
}
int32_t *input = (*tensor)->mutable_data<int32_t>();
env->GetIntArrayRegion(buf, 0, buf_size, input);
return JNI_TRUE;
}
JNIEXPORT jfloatArray JNICALL
Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) {
if (is_const_tensor(env, jtensor)) {
@@ -148,6 +164,20 @@ Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) {
}
}
JNIEXPORT jintArray JNICALL
Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *env, jobject jtensor) {
if (is_const_tensor(env, jtensor)) {
std::unique_ptr<const Tensor> *tensor =
get_read_only_tensor_pointer(env, jtensor);
return cpp_array_to_jintarray(
env, (*tensor)->data<int32_t>(), product((*tensor)->shape()));
} else {
std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
return cpp_array_to_jintarray(
env, (*tensor)->data<int32_t>(), product((*tensor)->shape()));
}
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(
JNIEnv *env, jobject jtensor, jlong java_pointer) {
if (java_pointer == 0) {
......
@@ -16,8 +16,8 @@
#include <jni.h>
/* Header for class com_baidu_paddle_lite_Tensor */
-#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
-#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#ifndef LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#define LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
#ifdef __cplusplus
extern "C" {
#endif
@@ -49,6 +49,14 @@ Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject);
JNIEXPORT jbyteArray JNICALL
Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject);
/*
* Class: com_baidu_paddle_lite_Tensor
* Method: getIntData
* Signature: ()[I
*/
JNIEXPORT jintArray JNICALL
Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *, jobject);
/*
* Class: com_baidu_paddle_lite_Tensor
* Method: nativeResize
@@ -73,6 +81,14 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F(
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
JNIEnv *, jobject, jbyteArray);
/*
* Class: com_baidu_paddle_lite_Tensor
* Method: nativeSetData
* Signature: ([I)Z
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
JNIEnv *, jobject, jintArray);
/*
* Class: com_baidu_paddle_lite_Tensor
* Method: deleteCppTensor
@@ -87,4 +103,4 @@ Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong);
#ifdef __cplusplus
}
#endif
-#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#endif // LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
@@ -108,6 +108,19 @@ public class Tensor {
return nativeSetData(buf);
}
/**
* Set the tensor int data.
*
* @param buf the int array buffer which will be copied into tensor.
* @return true if set data successfully.
*/
public boolean setData(int[] buf) {
if (readOnly) {
return false;
}
return nativeSetData(buf);
}
/**
* @return shape of the tensor as long array.
*/
@@ -123,12 +136,19 @@ public class Tensor {
*/
public native byte[] getByteData();
/**
* @return the tensor data as int array.
*/
public native int[] getIntData();
private native boolean nativeResize(long[] dims);
private native boolean nativeSetData(float[] buf);
private native boolean nativeSetData(byte[] buf);
private native boolean nativeSetData(int[] buf);
/**
* Delete C++ Tenor object pointed by the input pointer, which is presented by a
* long value.
......
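For reference, the new Java entry points bind to the same templated Tensor accessors that the C++ API already exposes. A rough C++ equivalent of setData(int[]) / getIntData() is sketched below (illustrative only; the input index and values are made up):

```cpp
#include <algorithm>
#include <vector>
#include "lite/api/paddle_api.h"

// Writes int32 data into an input tensor, mirroring what the nativeSetData([I)
// and getIntData() JNI bindings above do for a Java int[].
void RoundTripIntTensor(paddle::lite_api::PaddlePredictor* predictor) {
  auto input = predictor->GetInput(0);  // hypothetical input index
  input->Resize({1, 4});
  std::vector<int32_t> src = {1, 2, 3, 4};
  int32_t* dst = input->mutable_data<int32_t>();  // same call used by the JNI setter
  std::copy(src.begin(), src.end(), dst);
  // after predictor->Run(), an output can be read back with data<int32_t>()
}
```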
@@ -46,7 +46,6 @@ void OutputOptModel(const std::string& load_model_dir,
config.set_model_dir(load_model_dir);
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
-Place{TARGET(kX86), PRECISION(kFloat)},
};
if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(),
......
@@ -139,22 +139,15 @@ std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
-std::vector<const cpp::OpDesc *> feeds;
-std::vector<const cpp::OpDesc *> fetchs;
-#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) || defined(LITE_WITH_BM)
-// The shape of input tensors must be determined before generating NPU and XPU
-// program.
-auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-for (size_t i = 0; i < current_block->OpsSize(); i++) {
-auto op = current_block->GetOp<cpp::OpDesc>(i);
-#else
if (!program_) {
GenRuntimeProgram();
}
+std::vector<const cpp::OpDesc *> feeds;
+std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
-#endif
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......
@@ -20,6 +20,12 @@
#include "lite/core/device_info.h"
#include "lite/core/version.h"
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
#include <omp.h>
#include "lite/backends/x86/mklml.h"
#endif
namespace paddle {
namespace lite {
@@ -33,6 +39,17 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.cpu_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the "
"number of threads is:"
<< num_threads;
#endif
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <thread> // NOLINT
using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0,
"1,3,224,224",
"input shapes another, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_int32(test_type, 0, "multithread test type");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
int tid,
const int warmup_times = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
}
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
Timer ti;
for (int j = 0; j < repeat; ++j) {
ti.Start();
predictor->Run();
float t = ti.Stop();
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
<< " output[0]:" << out[0] << "; output[1]:" << out[1];
}
LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
<< ", power_mode: " << static_cast<int>(power_mode)
<< ", threads num " << thread_num
<< ", avg time: " << ti.LapTimes().Avg() << "ms"
<< ", min time: " << ti.LapTimes().Min() << " ms"
<< ", max time: " << ti.LapTimes().Max() << " ms.";
}
void RunTestType_00(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 5) {
std::thread run_th0(Run,
input_shapes,
model_dir,
power_mode,
thread_num,
repeat,
0,
warmup_times);
Run(input_shapes, model_dir, power_mode, thread_num, repeat, 1, warmup_times);
run_th0.join();
}
void RunTestType_01(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const std::vector<std::vector<int64_t>>& input_shapes_0,
const std::string& model_dir_0,
const PowerMode power_mode,
const int thread_num,
const int repeat,
const int warmup_times = 5) {
std::thread run_th0(Run,
input_shapes,
model_dir,
power_mode,
thread_num,
repeat,
0,
warmup_times);
Run(input_shapes_0,
model_dir_0,
power_mode,
thread_num,
repeat,
1,
warmup_times);
run_th0.join();
}
void run_with_predictor(std::shared_ptr<PaddlePredictor> predictor,
const std::vector<std::vector<int64_t>>& input_shapes,
int index,
const std::string& name) {
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
}
Timer ti;
ti.Start();
predictor->Run();
float t = ti.Stop();
auto output = predictor->GetOutput(0);
auto out = output->data<float>();
LOG(INFO) << "[thread " << index << "] name: " << name
<< ",run time: " << ti.LapTimes().Avg() << "ms"
<< " output[0]:" << out[0] << "; output[1]:" << out[1];
}
void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const PowerMode power_mode,
const int thread_num,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < repeat; ++i) {
std::thread pre_th0(
run_with_predictor, predictor, input_shapes, i, model_dir);
pre_th0.join();
}
}
void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const std::vector<std::vector<int64_t>>& input_shapes_0,
const std::string& model_dir_0,
const PowerMode power_mode,
const int thread_num,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0);
auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) {
std::thread pre_th0(
run_with_predictor, predictor, input_shapes, i, model_dir);
std::thread pre_th1(
run_with_predictor, predictor_0, input_shapes_0, i + 1, model_dir_0);
pre_th0.join();
pre_th1.join();
}
}
#endif
} // namespace lite_api
} // namespace paddle
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
exit(0);
}
std::string save_optimized_model_dir = "";
std::string save_optimized_model_dir_0 = "";
if (FLAGS_use_optimize_nb) {
save_optimized_model_dir = FLAGS_model_dir;
save_optimized_model_dir_0 = FLAGS_model_dir_0;
} else {
save_optimized_model_dir = FLAGS_model_dir + "opt2";
save_optimized_model_dir_0 = FLAGS_model_dir_0 + "opt2";
}
auto split_string =
[](const std::string& str_in) -> std::vector<std::string> {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
};
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
};
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
}
if (!FLAGS_use_optimize_nb) {
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
paddle::lite_api::OutputOptModel(
FLAGS_model_dir_0, save_optimized_model_dir_0, input_shapes_0);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
if (FLAGS_test_type == 0) {
paddle::lite_api::RunTestType_00(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats,
5);
LOG(INFO) << "=========above is case 0, below is case "
"1============================";
paddle::lite_api::RunTestType_10(
input_shapes,
save_optimized_model_dir,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats);
}
if (FLAGS_test_type == 1) {
paddle::lite_api::RunTestType_01(
input_shapes,
save_optimized_model_dir,
input_shapes_0,
save_optimized_model_dir_0,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats,
5);
LOG(INFO) << "=========above is case 0, below is case "
"1============================";
paddle::lite_api::RunTestType_11(
input_shapes,
save_optimized_model_dir,
input_shapes_0,
save_optimized_model_dir_0,
static_cast<paddle::lite_api::PowerMode>(0),
FLAGS_threads,
FLAGS_repeats);
}
#endif
return 0;
}
@@ -90,6 +90,10 @@ std::vector<Place> ParserValidPlaces() {
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......
@@ -47,7 +47,6 @@ void OutputOptModel(const std::string& load_model_dir,
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
-Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
@@ -72,10 +71,6 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int thread_num,
const int repeat,
const int warmup_times = 0) {
-#ifdef LITE_WITH_PROFILE
-lite::profile::BasicProfiler<lite::profile::BasicTimer>::Global().SetWarmup(
-warmup_times);
-#endif
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_power_mode(power_mode);
......
@@ -133,6 +133,7 @@ class LITE_API CxxConfig : public ConfigBase {
std::string model_file_;
std::string param_file_;
bool model_from_memory_{false};
int cpu_math_library_math_threads_ = 1;
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -151,6 +152,13 @@ class LITE_API CxxConfig : public ConfigBase {
std::string model_file() const { return model_file_; }
std::string param_file() const { return param_file_; }
bool model_from_memory() const { return model_from_memory_; }
void set_cpu_math_library_num_threads(int threads) {
cpu_math_library_math_threads_ = threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_math_threads_;
}
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
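A short usage sketch for the new setting (illustrative only; the model directory is hypothetical). As the cxx_api_impl.cc hunk above shows, the stored value is forwarded to MKLML and OpenMP only when the build enables LITE_WITH_X86 and PADDLE_WITH_MKLML and is not the model-optimize tool:

```cpp
#include "lite/api/paddle_api.h"

void BuildX86Predictor() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("step_rnn");            // hypothetical model directory
  config.set_cpu_math_library_num_threads(4);  // consumed in CxxPaddleApiImpl::Init()
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
}
```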
@@ -78,7 +78,8 @@ const std::string& PrecisionToStr(PrecisionType precision) {
}
const std::string& DataLayoutToStr(DataLayoutType layout) {
-static const std::string datalayout2string[] = {"unk", "NCHW", "any", "NHWC"};
+static const std::string datalayout2string[] = {
+"unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"};
auto x = static_cast<int>(layout);
CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
return datalayout2string[x];
@@ -117,8 +118,13 @@ const std::string& PrecisionRepr(PrecisionType precision) {
}
const std::string& DataLayoutRepr(DataLayoutType layout) {
-static const std::string datalayout2string[] = {
-"kUnk", "kNCHW", "kAny", "kNHWC"};
+static const std::string datalayout2string[] = {"kUnk",
+"kNCHW",
+"kAny",
+"kNHWC",
+"kImageDefault",
+"kImageFolder",
+"kImageNW"};
auto x = static_cast<int>(layout);
CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
return datalayout2string[x];
@@ -149,8 +155,12 @@ std::set<PrecisionType> ExpandValidPrecisions(PrecisionType precision) {
}
std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
-static const std::set<DataLayoutType> valid_set(
-{DATALAYOUT(kNCHW), DATALAYOUT(kAny), DATALAYOUT(kNHWC)});
+static const std::set<DataLayoutType> valid_set({DATALAYOUT(kNCHW),
+DATALAYOUT(kAny),
+DATALAYOUT(kNHWC),
+DATALAYOUT(kImageDefault),
+DATALAYOUT(kImageFolder),
+DATALAYOUT(kImageNW)});
if (layout == DATALAYOUT(kAny)) {
return valid_set;
}
......
@@ -72,8 +72,11 @@ enum class DataLayoutType : int {
kUnk = 0,
kNCHW = 1,
kNHWC = 3,
-kAny = 2, // any data layout
-NUM = 4, // number of fields.
+kImageDefault = 4, // for opencl image2d
+kImageFolder = 5, // for opencl image2d
+kImageNW = 6, // for opencl image2d
+kAny = 2, // any data layout
+NUM = 7, // number of fields.
};
typedef enum {
......
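The three new enumerators describe OpenCL image2d layouts alongside the existing buffer layouts, and NUM is bumped so the string tables and valid-layout sets above stay in range. A small sketch of how such a layout can appear in a Place (not part of the patch; the OpenCL target/precision combination is only an assumption):

```cpp
#include "lite/api/paddle_place.h"

// A kernel or valid place that works on OpenCL image2d memory carries one of
// the new layouts, e.g. kImageDefault, instead of the buffer layouts kNCHW/kNHWC.
paddle::lite_api::Place MakeOpenCLImagePlace() {
  return paddle::lite_api::Place{
      TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)};
}
```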
@@ -20,15 +20,6 @@ USE_MIR_PASS(static_kernel_pick_pass);
USE_MIR_PASS(variable_place_inference_pass);
USE_MIR_PASS(type_target_cast_pass);
USE_MIR_PASS(generate_program_pass);
-#ifdef LITE_WITH_NPU
-USE_MIR_PASS(generate_npu_program_pass);
-#endif
-#ifdef LITE_WITH_XPU
-USE_MIR_PASS(generate_xpu_program_pass);
-#endif
-#ifdef LITE_WITH_BM
-USE_MIR_PASS(generate_bm_program_pass);
-#endif
USE_MIR_PASS(io_copy_kernel_pick_pass);
USE_MIR_PASS(argument_type_display_pass);
@@ -40,11 +31,16 @@ USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass);
+USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
+USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
+USE_MIR_PASS(npu_subgraph_pass);
+USE_MIR_PASS(xpu_subgraph_pass);
@@ -4,3 +4,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif()
@@ -165,6 +165,9 @@ void BindLitePlace(py::module *m) {
py::enum_<DataLayoutType>(*m, "DataLayoutType")
.value("NCHW", DataLayoutType::kNCHW)
.value("NHWC", DataLayoutType::kNHWC)
.value("ImageDefault", DataLayoutType::kImageDefault)
.value("ImageFolder", DataLayoutType::kImageFolder)
.value("ImageNW", DataLayoutType::kImageNW)
.value("Any", DataLayoutType::kAny); .value("Any", DataLayoutType::kAny);
// Place // Place
......
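With the image layouts now exposed through the Python bindings above, an OpenCL-targeted valid place can be spelled in C++ roughly as below. This is a sketch: the header path and the precision paired with the image layout are assumptions for illustration; only the DataLayoutType names come from this commit.

```cpp
#include <vector>
#include "paddle_api.h"  // assumed public header exposing paddle::lite_api::Place

using namespace paddle::lite_api;

// Sketch: prefer OpenCL image2d kernels, fall back to ARM float kernels.
std::vector<Place> BuildValidPlaces() {
  return {
      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)},
      Place{TARGET(kARM), PRECISION(kFloat)},
  };
}
```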
...@@ -30,6 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -30,6 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::string model_dir = FLAGS_model_dir; std::string model_dir = FLAGS_model_dir;
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(model_dir); config.set_model_dir(model_dir);
config.set_cpu_math_library_num_threads(1);
config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
...@@ -48,7 +49,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -48,7 +49,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
"micro_video_id", "micro_video_id",
"vertical_type_id"}; "vertical_type_id"};
for (int i = 0; i < target_names.size(); ++i) { for (size_t i = 0; i < target_names.size(); ++i) {
auto input_tensor = predictor->GetInput(i); auto input_tensor = predictor->GetInput(i);
int size = 0; int size = 0;
if (i == 6 || i == 8) { if (i == 6 || i == 8) {
...@@ -73,8 +74,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -73,8 +74,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
predictor->Run(); predictor->Run();
} }
// LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
LOG(INFO) << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average."; << " ms in average.";
...@@ -85,8 +85,8 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -85,8 +85,8 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::vector<int64_t> out_shape = out->shape(); std::vector<int64_t> out_shape = out->shape();
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR( EXPECT_NEAR(
out->data<float>()[j + (out_shape[1] * i)], results[i][j], 1e-6); out->data<float>()[j + (out_shape[1] * i)], results[i][j], 1e-6);
} }
......
...@@ -120,5 +120,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) ...@@ -120,5 +120,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
stack.cc stack.cc
affine_channel.cc affine_channel.cc
anchor_generator.cc anchor_generator.cc
split_merge_lod_tenosr.cc
reduce_prod.cc
DEPS ${lite_kernel_deps} context tensor) DEPS ${lite_kernel_deps} context tensor)
endif() endif()
...@@ -26,31 +26,32 @@ namespace math { ...@@ -26,31 +26,32 @@ namespace math {
void concat_func(const std::vector<lite::Tensor *> &input, void concat_func(const std::vector<lite::Tensor *> &input,
const int axis, const int axis,
lite::Tensor *output) { lite::Tensor *output) {
size_t num = input.size(); int64_t concat_input_size = 1;
int rows = 1; int64_t num_cancats = 1;
auto dim_0 = input[0]->dims(); auto dim_0 = input[0]->dims();
for (int i = 0; i < axis; ++i) { size_t num = input.size();
rows *= dim_0[i]; for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
} }
int out_rows = rows, out_cols = 0; for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i]->numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
} }
float *dst_ptr = output->mutable_data<float>();
// computation const int out_concat_axis = output->dims()[axis];
for (int k = 0; k < out_rows; ++k) { int64_t offset_concat_axis = 0;
float *dst_ptr = output->mutable_data<float>() + k * out_cols; int64_t out_sum = out_concat_axis * concat_input_size;
int col_idx = 0; for (int n = 0; n < num; n++) {
for (int j = 0; j < num; ++j) { auto dims = input[n]->dims();
int col_len = input_cols[j]; const float *src_ptr = input[n]->data<float>();
const float *src_prt = input[j]->data<float>() + k * col_len; int64_t in_concat_axis = dims[axis];
std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
col_idx += col_len; int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
} }
offset_concat_axis += in_concat_axis;
} }
} }
......
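The old row/column implementation of concat_func and its replacement are interleaved above; as a reading aid, here is just the new logic restated as a self-contained function, with lite::Tensor replaced by raw pointers and shape vectors purely for illustration. Each input contributes `num_concats` contiguous slabs of `dims[n][axis] * concat_input_size` floats, copied into the output at a running offset along the concat axis.

```cpp
#include <cstring>
#include <vector>

void ConcatAlongAxis(const std::vector<const float*>& inputs,
                     const std::vector<std::vector<int64_t>>& dims,
                     int axis,
                     int64_t out_axis_size,
                     float* out) {
  // Elements per step along the concat axis (product of dims after `axis`).
  int64_t concat_input_size = 1;
  for (int i = axis + 1; i < static_cast<int>(dims[0].size()); ++i) {
    concat_input_size *= dims[0][i];
  }
  // Number of slabs to copy per input (product of dims before `axis`).
  int64_t num_concats = 1;
  for (int i = 0; i < axis; ++i) {
    num_concats *= dims[0][i];
  }
  const int64_t out_stride = out_axis_size * concat_input_size;
  int64_t offset_concat_axis = 0;
  for (size_t n = 0; n < inputs.size(); ++n) {
    const int64_t in_axis_size = dims[n][axis];
    const int64_t in_stride = in_axis_size * concat_input_size;
    const float* src = inputs[n];
    float* dst = out + offset_concat_axis * concat_input_size;
    for (int64_t i = 0; i < num_concats; ++i) {
      std::memcpy(dst, src, sizeof(float) * in_stride);
      dst += out_stride;
      src += in_stride;
    }
    offset_concat_axis += in_axis_size;
  }
}
```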
...@@ -76,6 +76,7 @@ void conv_3x3s1_direct_fp32(const float* i_data, ...@@ -76,6 +76,7 @@ void conv_3x3s1_direct_fp32(const float* i_data,
const int threads = ctx->threads(); const int threads = ctx->threads();
int l2_size = ctx->llc_size() / sizeof(float); int l2_size = ctx->llc_size() / sizeof(float);
auto paddings = *param.paddings; auto paddings = *param.paddings;
auto act_param = param.activation_param;
const int pad_h = paddings[0]; const int pad_h = paddings[0];
const int pad_w = paddings[2]; const int pad_w = paddings[2];
...@@ -469,7 +470,8 @@ void conv_3x3s1_direct_fp32(const float* i_data, ...@@ -469,7 +470,8 @@ void conv_3x3s1_direct_fp32(const float* i_data,
oh, oh,
ow, ow,
flag_relu, flag_relu,
ptr_write); ptr_write,
&act_param);
} }
const float* weight_remain_ptr = weights + c_round_down * w_stride; const float* weight_remain_ptr = weights + c_round_down * w_stride;
#pragma omp parallel for num_threads(threads) #pragma omp parallel for num_threads(threads)
...@@ -780,7 +782,8 @@ void conv_3x3s1_direct_fp32(const float* i_data, ...@@ -780,7 +782,8 @@ void conv_3x3s1_direct_fp32(const float* i_data,
oh, oh,
ow, ow,
flag_relu, flag_relu,
ptr_write); ptr_write,
&act_param);
} }
} }
} }
......
...@@ -75,6 +75,7 @@ void conv_3x3s2_direct_fp32(const float* i_data, ...@@ -75,6 +75,7 @@ void conv_3x3s2_direct_fp32(const float* i_data,
//! prepack input to tmp buffer //! prepack input to tmp buffer
//! write output to tmp buffer //! write output to tmp buffer
auto paddings = *param.paddings; auto paddings = *param.paddings;
auto act_param = param.activation_param;
const int threads = ctx->threads(); const int threads = ctx->threads();
int l2_size = ctx->llc_size() / sizeof(float); int l2_size = ctx->llc_size() / sizeof(float);
const int pad_w = paddings[2]; const int pad_w = paddings[2];
...@@ -510,7 +511,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, ...@@ -510,7 +511,8 @@ void conv_3x3s2_direct_fp32(const float* i_data,
oh, oh,
ow, ow,
flag_relu, flag_relu,
ptr_write); ptr_write,
&act_param);
} }
#pragma omp parallel for num_threads(threads) #pragma omp parallel for num_threads(threads)
...@@ -839,7 +841,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, ...@@ -839,7 +841,8 @@ void conv_3x3s2_direct_fp32(const float* i_data,
oh, oh,
ow, ow,
flag_relu, flag_relu,
ptr_write); ptr_write,
&act_param);
} }
} }
} }
......
...@@ -37,6 +37,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, ...@@ -37,6 +37,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_3x3s2_depthwise_fp32(const float* i_data, void conv_3x3s2_depthwise_fp32(const float* i_data,
...@@ -51,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -51,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s1_fp32(const float* din, void conv_depthwise_3x3s1_fp32(const float* din,
...@@ -66,7 +68,7 @@ void conv_depthwise_3x3s1_fp32(const float* din, ...@@ -66,7 +68,7 @@ void conv_depthwise_3x3s1_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2_fp32(const float* din, void conv_depthwise_3x3s2_fp32(const float* din,
...@@ -82,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -82,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
template <typename Dtype> template <typename Dtype>
......
...@@ -579,11 +579,11 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -579,11 +579,11 @@ void conv_depthwise_3x3_fp32(const void* din,
ARMContext* ctx, ARMContext* ctx,
const float* scale) { const float* scale) {
auto paddings = *param.paddings; auto paddings = *param.paddings;
auto act_param = param.activation_param;
const int pad_h = paddings[0]; const int pad_h = paddings[0];
const int pad_w = paddings[2]; const int pad_w = paddings[2];
int stride = param.strides[1]; int stride = param.strides[1];
int pad = pad_w; int pad = pad_w;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
bool pads_equal = bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
...@@ -602,7 +602,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -602,7 +602,7 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu, act_param,
ctx); ctx);
} else { } else {
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din), conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
...@@ -617,6 +617,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -617,6 +617,7 @@ void conv_depthwise_3x3_fp32(const void* din,
reinterpret_cast<const float*>(weights), reinterpret_cast<const float*>(weights),
bias, bias,
param, param,
act_param,
ctx); ctx);
} }
...@@ -635,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -635,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu, act_param,
ctx); ctx);
} else { } else {
conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din), conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din),
...@@ -650,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -650,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din,
reinterpret_cast<const float*>(weights), reinterpret_cast<const float*>(weights),
bias, bias,
param, param,
act_param,
ctx); ctx);
} }
} else { } else {
......
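These depthwise conv entry points now receive the full operators::ActivationParam instead of a single flag_relu, so fused activations other than plain ReLU can reach the kernels. A self-contained illustration of what the richer parameter allows — the struct and enum below are stand-ins for illustration, not the real lite types:

```cpp
#include <algorithm>

// Stand-in types; the real code uses operators::ActivationParam.
enum class ActType { kNone, kRelu, kRelu6 };
struct ActParam {
  ActType type = ActType::kNone;
  float relu6_coef = 6.f;
};

// A bool flag_relu could only express kNone vs kRelu; the param form also
// carries the clipping coefficient needed for ReLU6 and similar fusions.
inline float ApplyFusedAct(float x, const ActParam& p) {
  switch (p.type) {
    case ActType::kRelu:
      return std::max(x, 0.f);
    case ActType::kRelu6:
      return std::min(std::max(x, 0.f), p.relu6_coef);
    default:
      return x;
  }
}
```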
...@@ -316,7 +316,9 @@ void fill_bias_int8(int* tensor, ...@@ -316,7 +316,9 @@ void fill_bias_int8(int* tensor,
int channel_size); int channel_size);
// new winograd // new winograd
void weight_trans_c4( void weight_trans_c4_8x8(
float* dest, const float* src, int ic, int oc, void* workspace);
void weight_trans_c4_4x4(
float* dest, const float* src, int ic, int oc, void* workspace); float* dest, const float* src, int ic, int oc, void* workspace);
void conv_compute_6x6_3x3(const float* input, void conv_compute_6x6_3x3(const float* input,
float* output, float* output,
...@@ -331,6 +333,32 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -331,6 +333,32 @@ void conv_compute_6x6_3x3(const float* input,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx); ARMContext* ctx);
void conv_compute_2x2_3x3(const float* input,
float* output,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weight,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
void conv_compute_2x2_3x3_small(const float* input,
float* output,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weight,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -557,6 +557,52 @@ void elementwise_mul<float>(const float* dinx, ...@@ -557,6 +557,52 @@ void elementwise_mul<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul<int>(const int* dinx,
const int* diny,
int* dout,
int num) {
int cnt = num >> 4;
int remain = num % 16;
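// Vector body below handles 16 ints per iteration with four 128-bit NEON
// registers; the scalar tail covers the remaining num % 16 elements.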
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const int* dinx_ptr = dinx + (i << 4);
const int* diny_ptr = diny + (i << 4);
int* dout_ptr = dout + (i << 4);
int32x4_t dinx0 = vld1q_s32(dinx_ptr);
int32x4_t dinx1 = vld1q_s32(dinx_ptr + 4);
int32x4_t dinx2 = vld1q_s32(dinx_ptr + 8);
int32x4_t dinx3 = vld1q_s32(dinx_ptr + 12);
int32x4_t diny0 = vld1q_s32(diny_ptr);
int32x4_t diny1 = vld1q_s32(diny_ptr + 4);
int32x4_t diny2 = vld1q_s32(diny_ptr + 8);
int32x4_t diny3 = vld1q_s32(diny_ptr + 12);
dinx0 = vmulq_s32(dinx0, diny0);
dinx1 = vmulq_s32(dinx1, diny1);
dinx2 = vmulq_s32(dinx2, diny2);
dinx3 = vmulq_s32(dinx3, diny3);
vst1q_s32(dout_ptr, dinx0);
vst1q_s32(dout_ptr + 4, dinx1);
vst1q_s32(dout_ptr + 8, dinx2);
vst1q_s32(dout_ptr + 12, dinx3);
}
if (remain > 0) {
const int* dinx_ptr = dinx + (cnt << 4);
const int* diny_ptr = diny + (cnt << 4);
int* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
*dout_ptr = *dinx_ptr * *diny_ptr;
dout_ptr++;
dinx_ptr++;
diny_ptr++;
}
}
}
template <> template <>
void elementwise_mul_relu<float>(const float* dinx, void elementwise_mul_relu<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -678,6 +724,73 @@ void elementwise_mul_broadcast<float>(const float* dinx, ...@@ -678,6 +724,73 @@ void elementwise_mul_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_mul_broadcast<int>(const int* dinx,
const int* diny,
int* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int* din_ptr = dinx + offset;
const int diny_data = diny[j];
int* dout_ptr = dout + offset;
int cnt = num >> 4;
int remain = num % 16;
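// Broadcast the per-channel scalar diny[j] into one NEON register and reuse it
// across the row; 16/8/4-lane blocks run before the scalar tail.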
int32x4_t rb = vdupq_n_s32(diny_data);
for (int k = 0; k < cnt; ++k) {
int32x4_t din0 = vld1q_s32(din_ptr);
int32x4_t din1 = vld1q_s32(din_ptr + 4);
int32x4_t din2 = vld1q_s32(din_ptr + 8);
int32x4_t din3 = vld1q_s32(din_ptr + 12);
din0 = vmulq_s32(din0, rb);
din1 = vmulq_s32(din1, rb);
din2 = vmulq_s32(din2, rb);
din3 = vmulq_s32(din3, rb);
vst1q_s32(dout_ptr, din0);
vst1q_s32(dout_ptr + 4, din1);
vst1q_s32(dout_ptr + 8, din2);
vst1q_s32(dout_ptr + 12, din3);
din_ptr += 16;
dout_ptr += 16;
}
if (remain >= 8) {
int32x4_t din0 = vld1q_s32(din_ptr);
int32x4_t din1 = vld1q_s32(din_ptr + 4);
din0 = vmulq_s32(din0, rb);
din1 = vmulq_s32(din1, rb);
vst1q_s32(dout_ptr, din0);
vst1q_s32(dout_ptr + 4, din1);
din_ptr += 8;
dout_ptr += 8;
remain -= 8;
}
if (remain >= 4) {
int32x4_t din0 = vld1q_s32(din_ptr);
din0 = vmulq_s32(din0, rb);
vst1q_s32(dout_ptr, din0);
din_ptr += 4;
dout_ptr += 4;
remain -= 4;
}
if (remain > 0) {
for (int p = 0; p < remain; ++p) {
*dout_ptr = *din_ptr * diny_data;
dout_ptr++;
din_ptr++;
}
}
}
}
}
template <> template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx, void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny, const float* diny,
......
...@@ -51,6 +51,7 @@ ...@@ -51,6 +51,7 @@
#include "lite/backends/arm/math/prior_box.h" #include "lite/backends/arm/math/prior_box.h"
#include "lite/backends/arm/math/reduce_max.h" #include "lite/backends/arm/math/reduce_max.h"
#include "lite/backends/arm/math/reduce_mean.h" #include "lite/backends/arm/math/reduce_mean.h"
#include "lite/backends/arm/math/reduce_prod.h"
#include "lite/backends/arm/math/scale.h" #include "lite/backends/arm/math/scale.h"
#include "lite/backends/arm/math/sequence_expand.h" #include "lite/backends/arm/math/sequence_expand.h"
#include "lite/backends/arm/math/sequence_pool.h" #include "lite/backends/arm/math/sequence_pool.h"
...@@ -61,6 +62,7 @@ ...@@ -61,6 +62,7 @@
#include "lite/backends/arm/math/slice.h" #include "lite/backends/arm/math/slice.h"
#include "lite/backends/arm/math/softmax.h" #include "lite/backends/arm/math/softmax.h"
#include "lite/backends/arm/math/split.h" #include "lite/backends/arm/math/split.h"
#include "lite/backends/arm/math/split_merge_lod_tenosr.h"
#include "lite/backends/arm/math/stack.h" #include "lite/backends/arm/math/stack.h"
#include "lite/backends/arm/math/topk.h" #include "lite/backends/arm/math/topk.h"
#include "lite/backends/arm/math/yolo_box.h" #include "lite/backends/arm/math/yolo_box.h"
......
...@@ -477,17 +477,23 @@ void nearest_interp(const float* src, ...@@ -477,17 +477,23 @@ void nearest_interp(const float* src,
float scale_h_new = (with_align) float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1)) ? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out)); : (static_cast<float>(h_in) / (h_out));
if (with_align) {
#pragma omp parallel for collapse(2) schedule(static) for (int h = 0; h < h_out; ++h) {
for (int h = 0; h < h_out; ++h) { float* dst_p = dst + h * w_out;
for (int w = 0; w < w_out; ++w) { int near_y = static_cast<int>(scale_h_new * h + 0.5);
int near_x = (with_align) ? static_cast<int>(scale_w_new * w + 0.5) for (int w = 0; w < w_out; ++w) {
: static_cast<int>(scale_w_new * w); int near_x = static_cast<int>(scale_w_new * w + 0.5);
int near_y = (with_align) ? static_cast<int>(scale_h_new * h + 0.5) *dst_p++ = src[near_y * w_in + near_x];
: static_cast<int>(scale_h_new * h); }
near_x = near_x < 0 ? 0 : near_x; }
near_y = near_y < 0 ? 0 : near_y; } else {
dst[h * w_out + w] = src[near_y * w_in + near_x]; for (int h = 0; h < h_out; ++h) {
float* dst_p = dst + h * w_out;
int near_y = static_cast<int>(scale_h_new * h);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w);
*dst_p++ = src[near_y * w_in + near_x];
}
} }
} }
} }
...@@ -520,9 +526,9 @@ void interpolate(lite::Tensor* X, ...@@ -520,9 +526,9 @@ void interpolate(lite::Tensor* X,
} }
auto out_size = OutSize; auto out_size = OutSize;
if (out_size != nullptr) { if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<float>(out_size); auto out_size_data = get_new_data_from_tensor<int>(out_size);
out_height = static_cast<int>(out_size_data[0]); out_height = out_size_data[0];
out_width = static_cast<int>(out_size_data[1]); out_width = out_size_data[1];
} }
} }
float height_scale = scale; float height_scale = scale;
...@@ -544,8 +550,10 @@ void interpolate(lite::Tensor* X, ...@@ -544,8 +550,10 @@ void interpolate(lite::Tensor* X,
int out_w = Out->dims()[3]; int out_w = Out->dims()[3];
int spatial_in = in_h * in_w; int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w; int spatial_out = out_h * out_w;
for (int i = 0; i < count; ++i) {
if ("Bilinear" == interpolate_type) { if ("Bilinear" == interpolate_type) {
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
bilinear_interp(din + spatial_in * i, bilinear_interp(din + spatial_in * i,
in_w, in_w,
in_h, in_h,
...@@ -555,7 +563,10 @@ void interpolate(lite::Tensor* X, ...@@ -555,7 +563,10 @@ void interpolate(lite::Tensor* X,
1.f / width_scale, 1.f / width_scale,
1.f / height_scale, 1.f / height_scale,
with_align); with_align);
} else if ("Nearest" == interpolate_type) { }
} else if ("Nearest" == interpolate_type) {
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
nearest_interp(din + spatial_in * i, nearest_interp(din + spatial_in * i,
in_w, in_w,
in_h, in_h,
......
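The rewritten nearest_interp above splits the aligned and unaligned cases into separate loops and hoists near_y out of the inner loop, while the OpenMP pragma moves up into interpolate() to parallelise over the batch-times-channel count. A compact restatement of the per-plane kernel, with the tensor plumbing stripped for illustration (the width-scale computation is assumed symmetric to the height-scale computation shown in the hunk):

```cpp
// Per-plane nearest-neighbour resize mirroring the logic above. `with_align`
// uses (in-1)/(out-1) scales with rounding; otherwise in/out scales with
// truncation.
void NearestInterpPlane(const float* src, int w_in, int h_in,
                        float* dst, int w_out, int h_out, bool with_align) {
  float scale_w = with_align ? static_cast<float>(w_in - 1) / (w_out - 1)
                             : static_cast<float>(w_in) / w_out;
  float scale_h = with_align ? static_cast<float>(h_in - 1) / (h_out - 1)
                             : static_cast<float>(h_in) / h_out;
  for (int h = 0; h < h_out; ++h) {
    float* dst_p = dst + h * w_out;
    int near_y = with_align ? static_cast<int>(scale_h * h + 0.5f)
                            : static_cast<int>(scale_h * h);
    for (int w = 0; w < w_out; ++w) {
      int near_x = with_align ? static_cast<int>(scale_w * w + 0.5f)
                              : static_cast<int>(scale_w * w);
      *dst_p++ = src[near_y * w_in + near_x];
    }
  }
}
```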
...@@ -47,6 +47,13 @@ void sgemm_prepack_c4_small(int M, ...@@ -47,6 +47,13 @@ void sgemm_prepack_c4_small(int M,
bool has_bias, bool has_bias,
bool has_relu, bool has_relu,
ARMContext* ctx); ARMContext* ctx);
void sgemm_prepack_c4_small(int M,
int N,
int K,
const float* A_packed,
const float* B,
float* C,
ARMContext* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -167,7 +167,7 @@ void pooling_basic(const float* din, ...@@ -167,7 +167,7 @@ void pooling_basic(const float* din,
"ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \
"fmax v6.4s, v4.4s, v5.4s \n" \ "fmax v6.4s, v4.4s, v5.4s \n" \
"subs %w[cnt], %w[cnt], #1 \n" \ "subs %w[cnt], %w[cnt], #1 \n" \
"fmax %w[vmax].4s, %w[vmax].4s, v6.4s \n" \ "fmax %[vmax].4s, %[vmax].4s, v6.4s \n" \
"bne 1b \n" "bne 1b \n"
#define GLOBAL_AVG \ #define GLOBAL_AVG \
"1: \n" \ "1: \n" \
...@@ -176,7 +176,7 @@ void pooling_basic(const float* din, ...@@ -176,7 +176,7 @@ void pooling_basic(const float* din,
"ld1 {v0.4s-v1.4s}, [%[data_in_channel]], #32 \n" \ "ld1 {v0.4s-v1.4s}, [%[data_in_channel]], #32 \n" \
"fadd %[vsum].4s, %[vsum].4s, v3.4s \n" \ "fadd %[vsum].4s, %[vsum].4s, v3.4s \n" \
"subs %w[cnt], %w[cnt], #1 \n" \ "subs %w[cnt], %w[cnt], #1 \n" \
"fadd %w[vsum].4s, %w[vsum].4s, v4.4s \n" \ "fadd %[vsum].4s, %[vsum].4s, v4.4s \n" \
"ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \
"bne 1b \n" "bne 1b \n"
......
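The change above drops the %w operand modifier from the vector fmax/fadd operands: %w requests the 32-bit general-register (wN) spelling of an operand, which is right for the loop counter in `subs %w[cnt], ...` but not for a NEON operand used with a .4s arrangement, which needs the plain %[name] form that expands to the full vN register. A minimal sketch of the corrected pattern, assuming an AArch64 target:

```cpp
#include <arm_neon.h>

// Running max of a vector accumulator, written the way the fixed macro does it.
inline float32x4_t RunningMax(float32x4_t vmax, float32x4_t vin) {
  asm volatile(
      "fmax %[vmax].4s, %[vmax].4s, %[vin].4s \n"  // plain %[..] -> vN register
      : [vmax] "+w"(vmax)
      : [vin] "w"(vin));
  return vmax;
}
```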
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/arm/math/reduce_prod.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void reduce_prod_n(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = channel_in * hw_size;
int data_index, src_index, src_index0;
for (int c = 0; c < channel_in; ++c) {
for (int h = 0; h < height_in; ++h) {
for (int w = 0; w < width_in; ++w) {
data_index = c * hw_size + h * width_in + w;
dst[data_index] = static_cast<T>(1);
for (int n = 0; n < num_in; ++n) {
src_index = n * chw_size + data_index;
dst[data_index] *= src[src_index];
}
}
}
}
}
template <typename T>
void reduce_prod_c(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = hw_size * channel_in;
int data_index, src_index0, src_index;
for (int n = 0; n < num_in; ++n) {
for (int h = 0; h < height_in; ++h) {
for (int w = 0; w < width_in; ++w) {
data_index = n * hw_size + h * width_in + w;
src_index0 = n * chw_size + h * width_in + w;
dst[data_index] = static_cast<T>(1);
for (int c = 0; c < channel_in; ++c) {
src_index = src_index0 + c * hw_size;
dst[data_index] *= src[src_index];
}
}
}
}
}
template <typename T>
void reduce_prod_h(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int cw_size = channel_in * width_in;
int chw_size = cw_size * height_in;
int hw_size = height_in * width_in;
int data_index, src_index, src_index0;
for (int n = 0; n < num_in; ++n) {
for (int c = 0; c < channel_in; ++c) {
for (int w = 0; w < width_in; ++w) {
data_index = n * cw_size + c * width_in + w;
src_index0 = n * chw_size + c * hw_size + w;
dst[data_index] = static_cast<T>(1);
for (int h = 0; h < height_in; ++h) {
src_index = src_index0 + h * width_in;
dst[data_index] *= src[src_index];
}
}
}
}
}
template <typename T>
void reduce_prod_w(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int ch_size = channel_in * height_in;
int hw_size = height_in * width_in;
int chw_size = ch_size * width_in;
int data_index = 0;
int src_index0 = 0;
int src_index = 0;
for (int n = 0; n < num_in; ++n) {
for (int c = 0; c < channel_in; ++c) {
for (int h = 0; h < height_in; ++h) {
data_index = n * ch_size + c * height_in + h;
src_index0 = n * chw_size + c * hw_size + h * width_in;
dst[data_index] = static_cast<T>(1);
for (int w = 0; w < width_in; ++w) {
src_index = src_index0 + w;
dst[data_index] *= src[src_index];
}
}
}
}
}
template <typename T>
void reduce_prod_nc(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce n first.
DDimLite ddimA({1, channel_in, height_in, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
auto* tmp_out = tensor_tmp.mutable_data<T>();
reduce_prod_n(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_prod_c(tmp_out, dst, 1, channel_in, height_in, width_in);
}
template <typename T>
void reduce_prod_ch(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce c first
DDimLite ddimA({num_in, 1, height_in, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
auto* tmp_out = tensor_tmp.mutable_data<T>();
reduce_prod_c(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_prod_h(tmp_out, dst, num_in, 1, height_in, width_in);
}
template <typename T>
void reduce_prod_hw(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce h first
DDimLite ddimA({num_in, channel_in, 1, width_in});
lite::Tensor tensor_tmp;
tensor_tmp.Resize(ddimA);
auto* tmp_out = tensor_tmp.mutable_data<T>();
reduce_prod_h(src, tmp_out, num_in, channel_in, height_in, width_in);
reduce_prod_w(tmp_out, dst, num_in, channel_in, 1, width_in);
}
template <typename T>
void reduce_prod_all(const T* src, T* dst, int64_t total_num) {
dst[0] = static_cast<T>(1);
for (int n = 0; n < total_num; ++n) {
dst[0] *= src[n];
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
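A tiny worked example for the reduce_prod helpers declared above, reducing over the channel dimension of an NCHW tensor. The include path is the one used by the new reduce_prod.cc; building it still assumes the usual lite include directories, so treat it as a sketch rather than a standalone program.

```cpp
#include <iostream>
#include "lite/backends/arm/math/reduce_prod.h"

int main() {
  // N=1, C=2, H=1, W=2, laid out NCHW: {c0w0, c0w1, c1w0, c1w1}
  float src[] = {2.f, 3.f, 4.f, 5.f};
  float dst[2];  // channel dimension reduced away -> N*H*W = 2 values
  paddle::lite::arm::math::reduce_prod_c(src, dst, /*num_in=*/1,
                                         /*channel_in=*/2, /*height_in=*/1,
                                         /*width_in=*/2);
  std::cout << dst[0] << " " << dst[1] << std::endl;  // prints: 8 15
}
```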
...@@ -86,6 +86,13 @@ template void slice(const int* input, ...@@ -86,6 +86,13 @@ template void slice(const int* input,
std::vector<int> ends, std::vector<int> ends,
int* out, int* out,
Context<TARGET(kARM)>* ctx); Context<TARGET(kARM)>* ctx);
template void slice(const float* input,
std::vector<int64_t> dims,
std::vector<int> axes,
std::vector<int> starts,
std::vector<int> ends,
float* out,
Context<TARGET(kARM)>* ctx);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -70,10 +70,12 @@ void split<float>(const float* din, ...@@ -70,10 +70,12 @@ void split<float>(const float* din,
int in_after = in_strides[axis]; int in_after = in_strides[axis];
int out_after = out_strides[axis]; int out_after = out_strides[axis];
const float* din_ptr = din + input_offset;
for (int i = 0; i < before; ++i) { for (int i = 0; i < before; ++i) {
split_cpy(din + input_offset + i * in_after, std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
out_data + i * out_after, din_ptr += in_after;
out_after); out_data += out_after;
} }
input_offset += out_strides[axis]; input_offset += out_strides[axis];
} }
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/split_merge_lod_tenosr.h"
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
namespace arm {
namespace math {
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod,
size_t start_idx,
size_t end_idx,
size_t start_level) {
LoD sub_lod;
for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
CHECK(start_idx <= end_idx);
CHECK(end_idx < lod[level_idx].size());
std::vector<uint64_t> level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
sub_lod.emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
void AppendLoD(LoD *lod, const LoD &lod_length) {
CHECK(lod->empty() || lod->size() == lod_length.size());
if (lod->empty()) {
for (size_t i = 0; i < lod_length.size(); ++i) {
lod->emplace_back(std::vector<uint64_t>({0}));
}
}
for (size_t i = 0; i < lod->size(); ++i) {
auto &level = (*lod)[i];
for (auto len : lod_length[i]) {
level.push_back(level.back() + len);
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
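GetSubLoDAndAbsoluteOffset walks the LoD levels from start_level down, recording the sequence lengths between start_idx and end_idx at each level and converting the indices into offsets for the next level, so the final pair is the absolute element range of the slice. A small worked example, assuming LoD is the usual vector-of-vector-of-uint64_t alias from lite/core/tensor.h:

```cpp
#include <cassert>
#include "lite/backends/arm/math/split_merge_lod_tenosr.h"

int main() {
  using paddle::lite::arm::math::GetSubLoDAndAbsoluteOffset;
  // Level 0: two top-level sequences covering level-1 entries [0,2) and [2,5).
  // Level 1: five sub-sequences with lengths {1, 2, 1, 2, 2} over 8 elements.
  paddle::lite::LoD lod = {{0, 2, 5}, {0, 1, 3, 4, 6, 8}};
  // Slice out the first top-level sequence.
  auto res = GetSubLoDAndAbsoluteOffset(lod, /*start_idx=*/0, /*end_idx=*/1,
                                        /*start_level=*/0);
  // Its sub-LoD keeps one length list per level: {{2}, {1, 2}} ...
  assert(res.first[0][0] == 2);
  assert(res.first[1][0] == 1 && res.first[1][1] == 2);
  // ... and it occupies elements [0, 3) of the underlying tensor.
  assert(res.second.first == 0 && res.second.second == 3);
  return 0;
}
```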
...@@ -2,5 +2,4 @@ if (NOT LITE_WITH_BM) ...@@ -2,5 +2,4 @@ if (NOT LITE_WITH_BM)
return() return()
endif() endif()
lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc bm_context.cc DEPS ${bm_runtime_libs}) lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
lite_cc_library(bm_builder SRCS builder.cc DEPS ${bm_builder_libs})
...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, ...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param,
this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
} }
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnnMathType_t math_type =
use_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
CUDNN_CHECK(cudnnSetConvolutionMathType(this->conv_desc_, math_type));
#endif
if (ic == param.groups && ic == oc && ic != 1) { if (ic == param.groups && ic == oc && ic != 1) {
this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} else if (1) { } else if (!param.var_length) {
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = param.filter->data<float>(); const auto* w_data = param.filter->data<float>();
auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA)); auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA));
......
...@@ -32,6 +32,5 @@ class PE { ...@@ -32,6 +32,5 @@ class PE {
virtual ~PE() {} virtual ~PE() {}
}; };
} // namespace zynqmp } // namespace zynqmp
} // namespace paddle } // namespace paddle
(The diffs of the remaining changed files are collapsed and not shown here.)