提交 794b01ec 编写于 作者: Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

......@@ -60,6 +60,7 @@ lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
......@@ -74,7 +75,7 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF)
lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
......@@ -192,6 +193,9 @@ if(LITE_WITH_CUDA)
include(cuda)
endif()
if(LITE_WITH_BM)
include(bm)
endif()
include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Locates the Bitmain (BM) SDK and exposes its libraries as imported targets.
# Skipped entirely unless LITE_WITH_BM is enabled.
if(NOT LITE_WITH_BM)
  return()
endif()

# BM_SDK_ROOT may come from the cache or from the environment; it is required.
if(NOT DEFINED BM_SDK_ROOT)
  set(BM_SDK_ROOT $ENV{BM_SDK_ROOT})
  if(NOT BM_SDK_ROOT)
    message(FATAL_ERROR "Must set BM_SDK_ROOT or env BM_SDK_ROOT when LITE_WITH_BM=ON")
  endif()
endif()

message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")

# Sanity-check the SDK layout by probing for the runtime interface header.
find_path(BM_SDK_INC NAMES bmruntime_interface.h
  PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
if(NOT BM_SDK_INC)
  message(FATAL_ERROR "Can not find bmruntime_interface.h in ${BM_SDK_ROOT}/include")
endif()

include_directories("${BM_SDK_ROOT}/include/bmruntime")
include_directories("${BM_SDK_ROOT}/include/bmlib")
include_directories("${BM_SDK_ROOT}/include/bmcompiler")
include_directories("${BM_SDK_ROOT}/include/bmcpu")
include_directories("${BM_SDK_ROOT}/include/bmlog")

# Helper: find library file ${lib_name} under ${search_path}, cache its
# location in ${var_name}, and expose it as the imported SHARED target
# ${target_name}. Fails the configure step if the library is missing.
function(bm_find_and_import_lib var_name target_name lib_name search_path)
  find_library(${var_name} NAMES ${lib_name}
    PATHS ${search_path})
  if(NOT ${var_name})
    message(FATAL_ERROR "Can not find ${lib_name} Library in ${BM_SDK_ROOT}")
  else()
    message(STATUS "Found ${lib_name} Library: ${${var_name}}")
    add_library(${target_name} SHARED IMPORTED GLOBAL)
    set_property(TARGET ${target_name} PROPERTY IMPORTED_LOCATION ${${var_name}})
  endif()
endfunction()

# Cache variable names and imported target names are kept identical to the
# previous hand-rolled stanzas so downstream code keeps working.
bm_find_and_import_lib(BM_SDK_RT_LIB       bmrt       bmrt       ${BM_SDK_ROOT}/lib/bmnn/pcie)
bm_find_and_import_lib(BM_SDK_BM_LIB       bmlib      bmlib      ${BM_SDK_ROOT}/lib/bmnn/pcie)
bm_find_and_import_lib(BM_SDK_COMPILER_LIB bmcompiler bmcompiler ${BM_SDK_ROOT}/lib/bmcompiler)
bm_find_and_import_lib(BM_SDK_CPU_LIB      bmcpu      bmcpu      ${BM_SDK_ROOT}/lib/bmnn/pcie)

# NOTE(review): runtime and builder lists are currently identical — presumably
# intentional while the BM backend is young; confirm before diverging them.
set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
set(bm_builder_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm builder libs")
......@@ -143,6 +143,10 @@ if (LITE_WITH_FPGA)
add_definitions("-DLITE_WITH_FPGA")
endif()
if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
if (LITE_WITH_PRECISION_PROFILE)
......
......@@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC)
message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include")
endif()
include_directories("${NPU_DDK_ROOT}")
include_directories("${NPU_DDK_ROOT}/include")
set(NPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
......
......@@ -174,27 +174,45 @@ if(NOT WITH_DSO)
endif(WIN32)
endif(NOT WITH_DSO)
function(add_cuda_static_lib alias cuda_lib_paths file_name)
unset(ABS_PATH CACHE)
find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH)
add_library(${alias} STATIC IMPORTED GLOBAL)
set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE)
if (NOT ABS_PATH)
message(FATAL_ERROR "Can not find CUDA static library: ${file_name}")
endif()
# Creates an imported target for a CUDA toolkit library.
#
# Usage:
#   add_cuda_lib(<target> [STATIC|SHARED] NAME <file_name> [PATHS <dirs_var>])
#
# <target>   name of the imported target to create
# STATIC|SHARED  kind of imported library (previously parsed but ignored —
#                every library was imported as SHARED even for .a archives)
# NAME       file name to search for (e.g. libcudart_static.a)
# PATHS      name of a list variable holding the search directories;
#            defaults to CUDNN_CHECK_LIBRARY_DIRS
#
# Appends <target> to CUDA_MODULES in the caller's scope. Fails the configure
# step if the library cannot be found.
function(add_cuda_lib TARGET_NAME)
  set(options STATIC SHARED)
  set(oneValueArgs "NAME")
  set(multiValueArgs "PATHS")
  cmake_parse_arguments(add_cuda_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  # find_library caches its result; clear it so repeated calls re-search.
  unset(ABS_PATH CACHE)
  if (NOT add_cuda_lib_PATHS)
    set(add_cuda_lib_PATHS CUDNN_CHECK_LIBRARY_DIRS)
  endif()
  find_library(ABS_PATH NAMES ${add_cuda_lib_NAME} PATHS ${${add_cuda_lib_PATHS}} NO_DEFAULT_PATH)
  # Fail before creating the target: previously the imported target was
  # created with an empty IMPORTED_LOCATION when the search failed.
  if (NOT ABS_PATH)
    message(FATAL_ERROR "Can not find CUDA library: ${add_cuda_lib_NAME}")
  endif()
  if (add_cuda_lib_STATIC)
    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
  else()
    add_library(${TARGET_NAME} SHARED IMPORTED GLOBAL)
  endif()
  set_property(TARGET ${TARGET_NAME} PROPERTY IMPORTED_LOCATION ${ABS_PATH})
  set(CUDA_MODULES ${CUDA_MODULES} ${TARGET_NAME} PARENT_SCOPE)
endfunction()
add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a)
add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a)
add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a)
if(LITE_WITH_STATIC_CUDA)
message(STATUS "Static link CUDA toolkit.")
add_cuda_lib(cudart_static STATIC NAME libcudart_static.a)
add_cuda_lib(cublas_static STATIC NAME libcublas_static.a)
add_cuda_lib(curand_static STATIC NAME libcurand_static.a)
add_cuda_lib(culibos_static STATIC NAME libculibos.a)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_lib(cublasLt_static STATIC NAME libcublasLt_static.a)
endif()
set_property(GLOBAL PROPERTY CUDA_MODULES cudnn_static ${CUDA_MODULES})
else()
message(STATUS "Dynamic Link CUDA toolkit.")
add_cuda_lib(cudart SHARED NAME libcudart.so)
add_cuda_lib(cublas SHARED NAME libcublas.so)
add_cuda_lib(curand SHARED NAME libcurand.so)
if(NOT ${CUDA_VERSION} LESS 10.1)
add_cuda_lib(cublasLt SHARED NAME libcublasLt.so)
endif()
set_property(GLOBAL PROPERTY CUDA_MODULES cudnn ${CUDA_MODULES})
endif()
set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES})
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
......
......@@ -69,9 +69,15 @@ if(CUDNN_FOUND)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY)
add_library(cudnn_static STATIC IMPORTED GLOBAL)
set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION
if(LITE_WITH_STATIC_CUDA)
add_library(cudnn_static STATIC IMPORTED GLOBAL)
set_property(TARGET cudnn_static PROPERTY IMPORTED_LOCATION
"${CUDNN_LIB_PATH}/libcudnn_static.a")
else()
add_library(cudnn SHARED IMPORTED GLOBAL)
set_property(TARGET cudnn PROPERTY IMPORTED_LOCATION
"${CUDNN_LIB_PATH}/libcudnn.so")
endif(LITE_WITH_STATIC_CUDA)
string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)"
CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -94,6 +94,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_BM)
foreach(var ${lite_deps_BM_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -119,7 +125,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -129,6 +135,7 @@ function(lite_cc_library TARGET)
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
......@@ -163,7 +170,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -177,6 +184,7 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -210,7 +218,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -232,6 +240,7 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -260,6 +269,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -270,12 +280,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -341,6 +351,12 @@ function(add_kernel TARGET device level)
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM)
return()
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
return()
......@@ -374,6 +390,7 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -392,7 +409,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -424,6 +441,7 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -66,6 +67,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......@@ -220,10 +224,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......@@ -235,10 +243,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
)
add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos)
endif()
......
......@@ -35,18 +35,18 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......@@ -60,13 +60,19 @@ if (WITH_TESTING)
${ops} ${host_kernels}
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels})
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
set(cxx_api_deps ${cxx_api_deps} ${fpga_deps})
endif()
if(LITE_WITH_BM)
set(light_api_deps ${light_api_deps} ${bm_deps})
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
......@@ -75,6 +81,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -84,10 +91,12 @@ if (NOT LITE_ON_TINY_PUBLISH)
SRCS cxx_api.cc
DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
endif()
......@@ -96,7 +105,7 @@ endif()
set(light_api_deps
scope target_wrapper_host model_parser program)
if(LITE_WITH_CUDA)
get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES)
get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api SRCS light_api.cc
......@@ -109,7 +118,8 @@ lite_cc_library(light_api SRCS light_api.cc
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -120,11 +130,14 @@ if(WITH_TESTING)
DEPS cxx_api mir_passes lite_api_test_helper
${ops} ${host_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -160,6 +173,12 @@ if(WITH_TESTING)
${ops} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn)
add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz)
if(LITE_WITH_BM)
lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
endif()
endif()
......@@ -238,9 +257,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
FPGA_DEPS ${fpga_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
cc_library(api_full_static SRCS DEPS paddle_api_full cxx_api paddle_api light_api ${cxx_api_deps} ${ops} ${host_kernels} ${cuda_kernels} program tensor memory naive_buffer types ${fluid_modules} protobuf ${cuda_static_deps})
endif()
bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
#-----------------------------------------------------------------------------------------------------
......@@ -250,6 +270,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
lite_cc_test(test_apis SRCS apis_test.cc
......@@ -258,6 +279,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -283,11 +305,13 @@ endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
${ops}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -302,6 +326,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -309,6 +334,7 @@ if(NOT IOS)
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -322,6 +348,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......
......@@ -13,32 +13,61 @@
// limitations under the License.
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <numeric>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_dir, "", "model dir");
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_string(result_filename, "", "save test result");
"set input shapes according to the model, "
"separated by colon and comma, "
"such as 1,3,244,244:1,3,300,300.");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
3,
"arm power mode: "
"0 for big cluster, "
"1 for little cluster, "
"2 for all cores, "
"3 for no bind");
DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save benchmark "
"result to the file");
DEFINE_bool(run_model_optimize,
false,
"if set true, apply model_optimize_tool to model, use optimized "
"model to test");
DEFINE_bool(is_quantized_model, false, "if set true, test the quantized model");
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
DEFINE_bool(is_quantized_model,
false,
"if set true, "
"test the performance of the quantized model. ");
namespace paddle {
namespace lite_api {
// Returns the current wall-clock time in microseconds since the Unix epoch.
inline double GetCurrentUS() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return static_cast<double>(tv.tv_sec) * 1e+6 + static_cast<double>(tv.tv_usec);
}
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
......@@ -58,7 +87,7 @@ void OutputOptModel(const std::string& load_model_dir,
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
LOG(INFO) << "Delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
......@@ -69,23 +98,22 @@ void OutputOptModel(const std::string& load_model_dir,
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const std::string& model_dir,
const int repeat,
const int thread_num,
const int warmup_times,
const std::string model_name) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_threads(thread_num);
config.set_power_mode(LITE_POWER_NO_BIND);
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_dir(model_dir);
auto predictor = lite_api::CreatePaddlePredictor(config);
// set input
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
for (size_t i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
......@@ -93,26 +121,37 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
}
}
for (int i = 0; i < warmup_times; ++i) {
// warmup
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = lite::GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
// run
std::vector<float> perf_vct;
for (int i = 0; i < FLAGS_repeats; ++i) {
auto start = GetCurrentUS();
predictor->Run();
auto end = GetCurrentUS();
perf_vct.push_back((end - start) / 1000.0);
}
auto end = lite::GetCurrentUS();
std::FILE* pf = std::fopen(FLAGS_result_filename.c_str(), "a");
if (nullptr == pf) {
LOG(INFO) << "create result file error";
exit(0);
std::sort(perf_vct.begin(), perf_vct.end());
float min_res = perf_vct.back();
float max_res = perf_vct.front();
float total_res = accumulate(perf_vct.begin(), perf_vct.end(), 0.0);
float avg_res = total_res / FLAGS_repeats;
// save result
std::ofstream ofs(FLAGS_result_filename, std::ios::app);
if (!ofs.is_open()) {
LOG(FATAL) << "open result file failed";
}
fprintf(pf,
"-- %-18s avg = %5.4f ms\n",
model_name.c_str(),
(end - start) / repeat / 1000.0);
std::fclose(pf);
ofs.precision(5);
ofs << std::setw(20) << std::fixed << std::left << model_name;
ofs << "min = " << std::setw(12) << min_res;
ofs << "max = " << std::setw(12) << max_res;
ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl;
ofs.close();
}
#endif
......@@ -122,9 +161,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model --result_filename "
"/path/to/resultfile";
LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
exit(0);
}
......@@ -166,11 +203,11 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) {
for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
// Output optimized model
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
......@@ -180,12 +217,7 @@ int main(int argc, char** argv) {
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shapes,
run_model_dir,
FLAGS_repeats,
FLAGS_threads,
FLAGS_warmup,
model_name);
paddle::lite_api::Run(input_shapes, run_model_dir, model_name);
#endif
return 0;
}
......@@ -42,11 +42,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
int num_threads = config.cpu_math_library_num_threads();
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the "
VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the "
"number of threads is:"
<< num_threads;
#endif
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <gflags/gflags.h>
#include <sstream>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
......@@ -33,10 +34,10 @@ using paddle::lite::profile::Timer;
DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
DEFINE_string(arg_name, "", "the arg name");
namespace paddle {
namespace lite_api {
......@@ -86,6 +87,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
......@@ -122,6 +124,28 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
output_num *= output_shape[i];
}
LOG(INFO) << "output_num: " << output_num;
// please turn off memory_optimize_pass to use this feature.
if (FLAGS_arg_name != "") {
auto arg_tensor = predictor->GetTensor(FLAGS_arg_name);
auto arg_shape = arg_tensor->shape();
int arg_num = 1;
std::ostringstream os;
os << "{";
for (int i = 0; i < arg_shape.size(); ++i) {
arg_num *= arg_shape[i];
os << arg_shape[i] << ",";
}
os << "}";
float sum = 0.;
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
}
}
#endif
......
......@@ -133,7 +133,9 @@ class LITE_API CxxConfig : public ConfigBase {
std::string model_file_;
std::string param_file_;
bool model_from_memory_{false};
int cpu_math_library_math_threads_ = 1;
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -153,12 +155,14 @@ class LITE_API CxxConfig : public ConfigBase {
std::string param_file() const { return param_file_; }
bool model_from_memory() const { return model_from_memory_; }
void set_cpu_math_library_num_threads(int threads) {
cpu_math_library_math_threads_ = threads;
#ifdef LITE_WITH_X86
void set_x86_math_library_num_threads(int threads) {
x86_math_library_math_threads_ = threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_math_threads_;
int x86_math_library_num_threads() const {
return x86_math_library_math_threads_;
}
#endif
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
......@@ -55,7 +55,8 @@ const std::string& TargetToStr(TargetType target) {
"any",
"fpga",
"npu",
"xpu"};
"xpu",
"bm"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -94,7 +95,8 @@ const std::string& TargetRepr(TargetType target) {
"kAny",
"kFPGA",
"kNPU",
"kXPU"};
"kXPU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -135,6 +137,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kOpenCL),
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -52,8 +52,9 @@ enum class TargetType : int {
kFPGA = 7,
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
NUM = 10, // number of fields.
NUM = 11, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_img_txt_path,
"",
"if set input_img_txt_path, read the img filename as input.");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
lite::Predictor predictor;
std::vector<std::string> passes;
passes.push_back("bm_subgraph_pass");
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
}
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
auto* out_data = out->data<float>();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out->numel(); i++) {
fprintf(fp, "%f\n", out_data[i]);
}
fclose(fp);
}
// End-to-end ResNet50 run: prefer the BM target, fall back to X86 kernels.
TEST(ResNet50, test_bm) {
  std::vector<Place> valid_places;
  valid_places.push_back(Place{TARGET(kBM), PRECISION(kFloat)});
  valid_places.push_back(Place{TARGET(kX86), PRECISION(kFloat)});
  TestModel(valid_places);
}
} // namespace lite
} // namespace paddle
......@@ -30,7 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::string model_dir = FLAGS_model_dir;
lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_cpu_math_library_num_threads(1);
#ifdef LITE_WITH_X86
config.set_x86_math_library_num_threads(1);
#endif
config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
......
......@@ -6,3 +6,4 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(bm)
......@@ -79,6 +79,7 @@ void conv_compute_6x6_3x3(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -296,7 +297,7 @@ void conv_compute_6x6_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -343,7 +344,7 @@ void conv_compute_6x6_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......@@ -366,6 +367,7 @@ void conv_compute_2x2_3x3(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -565,7 +567,7 @@ void conv_compute_2x2_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -606,7 +608,7 @@ void conv_compute_2x2_3x3(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......@@ -627,6 +629,7 @@ void conv_compute_2x2_3x3_small(const float* input,
const float* bias,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
float* tmp_work_space =
......@@ -819,7 +822,7 @@ void conv_compute_2x2_3x3_small(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
} else {
for (int ci = 0; ci < oc_4; ++ci) {
......@@ -860,7 +863,7 @@ void conv_compute_2x2_3x3_small(const float* input,
wout,
false,
zero_ptr,
nullptr);
&act_param);
}
}
}
......
......@@ -924,58 +924,58 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"st1 {v15.4s}, [%[doutr3]], #16 \n"
#define RIGHT_RESULT_S1_RELU6 \
"fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \
"bif v12.16b, v22.16b, v18.16b \n" \
"fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"st1 {v12.4s}, [%[doutr0]], #16 \n" \
\
"fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \
"bif v13.16b, v23.16b, v18.16b \n" \
\
"fmla v15.4s , v10.4s, v20.s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \
"st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \
\
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"bif v14.16b, v24.16b, v18.16b \n" \
"fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \
\
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
\
"fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \
"bif v15.16b, v25.16b, v18.16b \n" \
\
#define RIGHT_RESULT_S1_RELU6 \
"fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \
"bif v12.16b, v22.16b, v18.16b \n" \
"fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
"fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
"st1 {v12.4s}, [%[doutr0]], #16 \n" \
\
"fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
"fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \
"ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \
"bif v13.16b, v23.16b, v18.16b \n" \
\
"fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \
\
"fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \
"st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \
\
"fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \
\
"fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \
\
"fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \
\
"bif v14.16b, v24.16b, v18.16b \n" \
"fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \
\
"st1 {v14.4s}, [%[doutr2]], #16 \n" \
\
"fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \
"bif v15.16b, v25.16b, v18.16b \n" \
\
"st1 {v15.4s}, [%[doutr3]], #16 \n"
#define RIGHT_RESULT_S1_LEAKY_RELU \
......@@ -1586,7 +1586,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[six_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \
"vmax.f32 q4, q4, %q[vzero] @ relu \n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
......@@ -1617,7 +1617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
\
......@@ -1694,7 +1694,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
/* r3 */ \
"vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \
\
"vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \
"vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n" \
\
"vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \
\
......@@ -2339,17 +2339,29 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
int size_out_channel = w_out * h_out;
int w_stride = 9;
int tile_w = (w_in + 3) >> 2;
int cnt_col = tile_w - 2;
unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in);
int tile_w = w_out >> 2;
int remain = w_out % 4;
int cnt_col = tile_w - 1;
unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in);
const unsigned int remian_idx[4] = {0, 1, 2, 3};
if (remain == 0 && size_pad_right == 5) {
size_pad_right = 1;
cnt_col -= 1;
remain = 4;
} else if (remain == 0 && size_pad_right == 6) {
size_pad_right = 2;
cnt_col -= 1;
remain = 4;
}
uint32x4_t vmask_rp1 =
vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_rp2 =
vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_result =
vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx));
unsigned int vmask[8];
vst1q_u32(vmask, vmask_rp1);
......@@ -2398,7 +2410,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *din_ptr5 = dr5;
float *ptr_zero = const_cast<float *>(zero);
#ifdef __aarch64__
for (int i = 0; i < h_in; i += 4) {
for (int i = 0; i < h_out; i += 4) {
//! process top pad pad_h = 1
din_ptr0 = dr0;
din_ptr1 = dr1;
......@@ -2484,7 +2496,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
dout_ptr = dout_ptr + 4 * w_out;
}
#else
for (int i = 0; i < h_in; i += 2) {
for (int i = 0; i < h_out; i += 2) {
//! process top pad pad_h = 1
din_ptr0 = dr0;
din_ptr1 = dr1;
......@@ -2883,39 +2895,57 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
wbias = vdupq_n_f32(0.f);
}
int hs = -1;
int he = 3;
float out_buf1[4];
float out_buf2[4];
float trash_buf[4];
int h_cnt = (h_out + 1) >> 1;
float *doutr0 = dout_channel;
float *doutr1 = dout_channel + w_out;
for (int j = 0; j < h_cnt; ++j) {
const float *dr0 = din_channel + hs * w_in;
const float *dr1 = dr0 + w_in;
const float *dr2 = dr1 + w_in;
const float *dr3 = dr2 + w_in;
const float *dr0 = din_channel;
const float *dr1 = dr0 + w_in;
const float *dr2 = dr1 + w_in;
const float *dr3 = dr2 + w_in;
if (hs == -1) {
dr0 = zero;
for (int j = 0; j < h_out; j += 2) {
const float *dr0_ptr = dr0;
const float *dr1_ptr = dr1;
const float *dr2_ptr = dr2;
const float *dr3_ptr = dr3;
if (j == 0) {
dr0_ptr = zero;
dr1_ptr = dr0;
dr2_ptr = dr1;
dr3_ptr = dr2;
dr0 = dr1;
dr1 = dr2;
} else {
dr0 = dr2;
dr1 = dr3;
}
dr2 = dr1 + w_in;
dr3 = dr2 + w_in;
//! process bottom pad
if (j + 3 > h_in) {
switch (j + 3 - h_in) {
case 3:
dr1_ptr = zero;
case 2:
dr2_ptr = zero;
case 1:
dr3_ptr = zero;
default:
break;
}
}
switch (he - h_in) {
case 2:
dr2 = zero;
doutr1 = trash_buf;
case 1:
dr3 = zero;
default:
break;
//! process bottom remain
if (j + 2 > h_out) {
doutr1 = trash_buf;
}
act_switch_3x3s1p1_s(dr0,
dr1,
dr2,
dr3,
act_switch_3x3s1p1_s(dr0_ptr,
dr1_ptr,
dr2_ptr,
dr3_ptr,
out_buf1,
out_buf2,
wr0,
......@@ -2931,8 +2961,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
}
doutr0 = doutr1;
doutr1 += w_out;
hs += 2;
he += 2;
} // end of processing heights
} // end of processing channels
} // end of processing batchs
......@@ -3458,6 +3486,12 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
const int remian_idx[4] = {0, 1, 2, 3};
if (remain == 0 && size_pad_right == 6) { // w_in == w_out and w_out % 4 == 0
tile_w -= 1;
remain = 4;
size_pad_right = 2;
}
uint32x4_t vmask_rp1 =
vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
uint32x4_t vmask_rp2 =
......@@ -3603,16 +3637,14 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
dr2 = dr1 + w_in;
dr3 = dr2 + w_in;
//! process bottom pad
if (i + 3 >= h_in) {
switch (i + 3 - h_in) {
if (i + 4 > h_in) {
switch (i + 4 - h_in) {
case 3:
din_ptr1 = zero_ptr;
case 2:
din_ptr2 = zero_ptr;
case 1:
din_ptr3 = zero_ptr;
case 0:
din_ptr3 = zero_ptr;
default:
break;
}
......@@ -4016,22 +4048,21 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
doutr0 = dout_channel + j * w_out;
doutr1 = doutr0 + w_out;
if (j + 3 >= h_in) {
switch (j + 3 - h_in) {
if (j + 4 > h_in) {
switch (j + 4 - h_in) {
case 3:
dr1 = zero_ptr;
case 2:
dr2 = zero_ptr;
case 1:
dr3 = zero_ptr;
doutr1 = trash_buf;
case 0:
dr3 = zero_ptr;
doutr1 = trash_buf;
default:
break;
}
}
if (j + 2 > h_out) {
doutr1 = trash_buf;
}
unsigned int *vmask_ptr = vmask;
act_switch_3x3s1p0_s(dr0,
dr1,
......
......@@ -1202,15 +1202,17 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
int out_pad_idx[4] = {0, 1, 2, 3};
int size_pad_bottom = h_out * 2 - h_in;
int cnt_col = (w_out >> 2) - 2;
int size_right_remain = w_in - (7 + cnt_col * 8);
if (size_right_remain >= 9) {
cnt_col++;
size_right_remain -= 8;
}
int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
int tile_w = w_out >> 2;
int cnt_remain = w_out % 4;
unsigned int size_right_remain = (unsigned int)(7 + (tile_w << 3) - w_in);
size_right_remain = 8 - size_right_remain;
int size_right_pad = w_out * 2 - w_in;
if (cnt_remain == 0 && size_right_remain == 0) {
cnt_remain = 4;
tile_w -= 1;
size_right_remain = 8;
}
int cnt_col = tile_w - 1;
uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
vld1q_s32(right_pad_idx)); // 0 2 4 6
......@@ -1276,7 +1278,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
float* doutr1_ptr = nullptr;
#ifdef __aarch64__
for (int i = 0; i < h_in; i += 4) {
for (int i = 0; i < h_out; i += 2) {
din0_ptr = dr0;
din1_ptr = dr1;
din2_ptr = dr2;
......@@ -1303,8 +1305,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
dr4 = dr3 + w_in;
//! process bottom pad
if (i + 4 > h_in) {
switch (i + 4 - h_in) {
if (i * 2 + 4 > h_in) {
switch (i * 2 + 4 - h_in) {
case 4:
din1_ptr = zero_ptr;
case 3:
......@@ -1318,7 +1320,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
}
}
//! process output pad
if (i / 2 + 2 > h_out) {
if (i + 2 > h_out) {
doutr1_ptr = write_ptr;
}
int cnt = cnt_col;
......@@ -1343,7 +1345,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
doutr0 = doutr0 + 2 * w_out;
}
#else
for (int i = 0; i < h_in; i += 2) {
for (int i = 0; i < h_out; i++) {
din0_ptr = dr0;
din1_ptr = dr1;
din2_ptr = dr2;
......@@ -1364,8 +1366,8 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
}
//! process bottom pad
if (i + 2 > h_in) {
switch (i + 2 - h_in) {
if (i * 2 + 2 > h_in) {
switch (i * 2 + 2 - h_in) {
case 2:
din1_ptr = zero_ptr;
case 1:
......@@ -1641,7 +1643,8 @@ void act_switch_3x3s2p0(const float* din0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU6
"ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2
MID_RESULT_S2_RELU6
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU6
......@@ -1700,7 +1703,8 @@ void act_switch_3x3s2p0(const float* din0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
"ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2
MID_RESULT_S2_LEAKY_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_LEAKY_RELU
......@@ -1718,7 +1722,7 @@ void act_switch_3x3s2p0(const float* din0_ptr,
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vscale),
[scale_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
......@@ -1834,7 +1838,14 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
int tile_w = w_out >> 2;
int cnt_remain = w_out % 4;
unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3));
unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in);
size_right_remain = 8 - size_right_remain;
if (cnt_remain == 0 && size_right_remain == 0) {
cnt_remain = 4;
tile_w -= 1;
size_right_remain = 8;
}
uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
vld1q_s32(right_pad_idx)); // 0 2 4 6
......
......@@ -2237,7 +2237,7 @@ inline void act_switch_process(float* src,
int cnt = size >> 4;
int remain = size % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
if (act_param != nullptr && act_param->has_active) {
if (act_param != nullptr) {
float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
if (cnt > 0) {
......@@ -2327,6 +2327,7 @@ inline void act_switch_process(float* src,
src++;
dst++;
}
break;
case lite_api::ActivationType::kRelu6:
for (int i = 0; i < remain; i++) {
float tmp = *src >= 0.f ? *src : 0.f;
......@@ -2336,6 +2337,7 @@ inline void act_switch_process(float* src,
src++;
dst++;
}
break;
case lite_api::ActivationType::kLeakyRelu:
for (int i = 0; i < remain; i++) {
if (*src >= 0.f) {
......
......@@ -150,11 +150,26 @@ void conv_depthwise_5x5s2_fp32(const float* din,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx);
void conv_depthwise_5x5s2p2_fp32(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const float* weights,
const float* bias,
int pad,
bool flag_bias,
bool flag_relu,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_5x5s1_int8(Dtype* dout,
const int8_t* din,
......
......@@ -180,6 +180,8 @@ void conv1x1s1_gemm(const float* i_data,
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
auto act_param = param.activation_param;
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock);
int weights_size_per_group = m * k;
......@@ -223,7 +225,7 @@ void conv1x1s1_gemm(const float* i_data,
n,
bias_group,
flag_bias,
flag_relu,
act_param,
ctx);
}
}
......@@ -361,6 +363,8 @@ void conv_im2col_gemm(const float* i_data,
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock);
int weights_size_per_group = m * k;
auto act_param = param.activation_param;
if (n > 1) {
weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
}
......@@ -422,7 +426,7 @@ void conv_im2col_gemm(const float* i_data,
n,
bias_group,
flag_bias,
flag_relu,
act_param,
ctx);
}
}
......@@ -585,10 +589,9 @@ void conv_depthwise_3x3_fp32(const void* din,
int stride = param.strides[1];
int pad = pad_w;
bool flag_bias = param.bias != nullptr;
bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
if (stride == 1) {
if (pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -620,9 +623,8 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
}
} else if (stride == 2) {
if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -674,12 +676,13 @@ void conv_depthwise_5x5_fp32(const void* din,
ARMContext* ctx,
const float* scale) {
auto paddings = *param.paddings;
auto act_param = param.activation_param;
int pad = paddings[0];
int stride = param.strides[1];
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
ctx->ExtendWorkspace((w_in + w_out) * sizeof(float));
if (pad == 2 && stride == 2) {
if (stride == 2) {
conv_depthwise_5x5s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -691,9 +694,8 @@ void conv_depthwise_5x5_fp32(const void* din,
w_in,
reinterpret_cast<const float*>(weights),
bias,
pad,
flag_bias,
flag_relu,
param,
act_param,
ctx);
} else if (stride == 1) {
conv_depthwise_5x5s1_fp32(reinterpret_cast<const float*>(din),
......
......@@ -44,6 +44,8 @@ void conv_winograd3x3(const float* din,
int size_out_channel = wout * hout;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr;
auto act_param = param.activation_param;
act_param.has_active = false;
//! transform input
int tile_w = (wout + 5) / 6;
......@@ -127,7 +129,7 @@ void conv_winograd3x3(const float* din,
size_tile,
nullptr,
false,
false,
act_param,
ctx);
}
......
......@@ -115,7 +115,241 @@ void fill_bias_relu<int>(int* tensor,
}
}
}
#ifdef __aarch64__
// AArch64 inline-asm fragments, concatenated into a single asm() statement by
// fill_bias_act<float>. FILL_BIAS loads 16 floats (4 q-registers) and adds the
// per-channel bias; the FILL_* activation fragments post-process them in
// registers; FILL_STORE writes them back and loops on the [cnt] counter.
#define FILL_BIAS \
"1: \n" \
"ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"add v0.4s, v0.4s, %[vbias].4s \n" \
"add v1.4s, v1.4s, %[vbias].4s \n" \
"add v2.4s, v2.4s, %[vbias].4s \n" \
"add v3.4s, v3.4s, %[vbias].4s \n"
// ReLU: clamp each lane to >= 0 against [vzero].
#define FILL_RELU \
"fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */
// Upper clamp against [vsix]; combined with FILL_RELU this yields ReLU6.
#define FILL_RELU6 \
"fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
// Leaky ReLU: per-lane select between x (when x >= 0) and x * [vscale],
// using cmhs masks + bif. Clobbers v4-v11 as scratch.
#define FILL_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
"bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \
"bif v3.16b, v11.16b, v10.16b \n" /* choose*/
// Stores the 16 results and branches back to label 1 while [cnt] != 0.
// NOTE(review): "subs" writes [cnt] in place, so the C operand must be
// bound with "+r" and treated as clobbered after the asm block.
#define FILL_STORE \
"subs %w[cnt], %w[cnt], #1 \n" \
"st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"bne 1b \n"
#else
// ARMv7 NEON equivalents of the fragments above, using q3-q6 for data and
// q7-q14 as leaky-relu scratch.
#define FILL_BIAS \
"1: \n" \
"vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vadd.f32 q3, q3, %q[vbias] @ add \n" \
"vadd.f32 q4, q4, %q[vbias] @ add \n" \
"vadd.f32 q5, q5, %q[vbias] @ add \n" \
"vadd.f32 q6, q6, %q[vbias] @ add \n"
// ReLU: clamp each lane to >= 0 against [vzero].
#define FILL_RELU \
"vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
// Upper clamp against [vsix]; combined with FILL_RELU this yields ReLU6.
#define FILL_RELU6 \
"vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
"vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
// Leaky ReLU: per-lane select between x and x * [vscale] via vcge + vbif.
#define FILL_LEAKY_RELU \
"vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q14, q13 @ choose \n"
// Stores the 16 results and loops on [cnt] (decremented in place by subs).
#define FILL_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"bne 1b \n"
#endif
// Adds the (optional) per-channel bias to `tensor` in place and applies the
// fused activation when act_param->has_active is set.
//
// tensor:       [channel, channel_size] data, modified in place.
// bias:         per-channel bias values; only read when flag_bias is true.
// channel:      number of channels.
// channel_size: elements per channel.
// act_param:    may be null; supports kRelu, kRelu6, kLeakyRelu.
//
// Fixes vs. previous revision:
//  * the asm loop decrements its counter via "+r", which zeroed `cnt` after
//    the first channel (skipping the vector path for all later channels, and
//    underflowing the counter in the no-act branch) — use a per-channel copy;
//  * the scalar tail now adds bias_data (the asm only biased cnt*16 elems);
//  * missing break;s in the tail switch made kRelu fall through into kRelu6
//    and kRelu6 into kLeakyRelu.
template <>
void fill_bias_act<float>(float* tensor,
                          const float* bias,
                          int channel,
                          int channel_size,
                          bool flag_bias,
                          const operators::ActivationParam* act_param) {
  float* data = tensor;
  int cnt = channel_size >> 4;  // 16 floats per asm iteration
  int remain = channel_size % 16;
  float32x4_t vzero = vdupq_n_f32(0.f);
  if (act_param != nullptr && act_param->has_active) {
    float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
    float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
    for (int j = 0; j < channel; j++) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      // Per-channel copy: the asm clobbers its counter ("+r"), so `cnt`
      // itself must stay intact for the next channel.
      int cnt_num = cnt;
      if (cnt_num > 0) {
        switch (act_param->active_type) {
          case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory",
                  "cc",
                  "v0",
                  "v1",
                  "v2",
                  "v3",
                  "v4",
                  "v5",
                  "v6",
                  "v7",
                  "v8",
                  "v9",
                  "v10",
                  "v11");
#else
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src),
                  [dout_ptr] "+r"(dst),
                  [cnt] "+r"(cnt_num)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory",
                  "cc",
                  "q3",
                  "q4",
                  "q5",
                  "q6",
                  "q7",
                  "q8",
                  "q9",
                  "q10",
                  "q11",
                  "q12",
                  "q13",
                  "q14");
#endif
            break;
          default:
            LOG(FATAL) << "this act_type: "
                       << static_cast<int>(act_param->active_type)
                       << " fuse not support";
        }
      }
      // Scalar tail: src/dst were advanced by the asm. The bias must be
      // added here too — the asm covered only the first cnt*16 elements.
      switch (act_param->active_type) {
        case lite_api::ActivationType::kRelu:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            *dst = tmp >= 0.f ? tmp : 0.f;
            src++;
            dst++;
          }
          break;
        case lite_api::ActivationType::kRelu6:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            tmp = tmp >= 0.f ? tmp : 0.f;
            *dst = tmp <= act_param->Relu_clipped_coef
                       ? tmp
                       : act_param->Relu_clipped_coef;
            src++;
            dst++;
          }
          break;
        case lite_api::ActivationType::kLeakyRelu:
          for (int i = 0; i < remain; i++) {
            float tmp = *src + bias_data;
            if (tmp >= 0.f) {
              *dst = tmp;
            } else {
              *dst = tmp * act_param->Leaky_relu_alpha;
            }
            src++;
            dst++;
          }
          break;
        default:
          LOG(FATAL) << "this act_type: "
                     << static_cast<int>(act_param->active_type)
                     << " fuse not support";
      }
    }
  } else {
    // Bias only, no activation.
    for (int j = 0; j < channel; ++j) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
      int cnt_num = cnt;  // per-channel copy, see note above
      if (cnt_num > 0) {
#ifdef __aarch64__
        asm volatile(FILL_BIAS FILL_STORE
                     : [din_ptr] "+r"(src),
                       [dout_ptr] "+r"(dst),
                       [cnt] "+r"(cnt_num)
                     : [vbias] "w"(vbias)
                     : "memory", "cc", "v0", "v1", "v2", "v3");
#else
        asm volatile(FILL_BIAS FILL_STORE
                     : [din_ptr] "+r"(src),
                       [dout_ptr] "+r"(dst),
                       [cnt] "+r"(cnt_num)
                     : [vbias] "w"(vbias)
                     : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
      }
      // Scalar tail: plain bias add.
      for (int i = 0; i < remain; i++) {
        *dst = *src + bias_data;
        src++;
        dst++;
      }
    }
  }
}
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -37,7 +37,22 @@ void fill_bias_relu(Dtype* tensor,
int channel_size,
bool flag_bias,
bool flag_relu);
/**
 * \brief neon implementation to add bias and activation (relu, relu6,
 * leakyrelu)
 * @param tensor input/output tensor, updated in place
 * @param bias per-channel bias values
 * @param channel number of channels
 * @param channel_size number of elements per channel
 */
template <typename Dtype>
void fill_bias_act(Dtype* tensor,
const Dtype* bias,
int channel,
int channel_size,
bool flag_bias,
const operators::ActivationParam* act_param);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -21,128 +21,179 @@ namespace arm {
namespace math {
// Adds a per-row bias to a [num, channel] FC output and optionally applies
// ReLU in the same pass (float specialization).
// NOTE(review): the previous text was flattened diff residue -- it contained
// both the old and the new signature plus a dead `#if 0` assembly block.
// This is the reconstructed post-merge implementation.
template <>
void fill_bias_fc<float>(
    float *out, const float *bias, int num, int channel, bool flag_relu) {
  int cnt = channel >> 4;    // 16 floats per unrolled step
  int remain = channel & 15;
  if (flag_relu) {
    float32x4_t vzero = vdupq_n_f32(0.f);
    for (int j = 0; j < num; ++j) {
      const float *ptr_bias = bias;
      float *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vin1 = vld1q_f32(ptr_out);
        float32x4_t vb1 = vld1q_f32(ptr_bias);
        float32x4_t vin2 = vld1q_f32(ptr_out + 4);
        float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
        float32x4_t vin3 = vld1q_f32(ptr_out + 8);
        float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
        float32x4_t vin4 = vld1q_f32(ptr_out + 12);
        float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
        float32x4_t vout1 = vaddq_f32(vin1, vb1);
        float32x4_t vout2 = vaddq_f32(vin2, vb2);
        float32x4_t vout3 = vaddq_f32(vin3, vb3);
        float32x4_t vout4 = vaddq_f32(vin4, vb4);
        vout1 = vmaxq_f32(vout1, vzero);
        vout2 = vmaxq_f32(vout2, vzero);
        vout3 = vmaxq_f32(vout3, vzero);
        vout4 = vmaxq_f32(vout4, vzero);
        vst1q_f32(ptr_out, vout1);
        vst1q_f32(ptr_out + 4, vout2);
        vst1q_f32(ptr_out + 8, vout3);
        vst1q_f32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail with fused ReLU.
      for (int i = 0; i < remain; ++i) {
        *ptr_out += *(ptr_bias++);
        *ptr_out = *ptr_out > 0.f ? *ptr_out : 0.f;
        ptr_out++;
      }
    }
  } else {
    for (int j = 0; j < num; ++j) {
      const float *ptr_bias = bias;
      float *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vin1 = vld1q_f32(ptr_out);
        float32x4_t vb1 = vld1q_f32(ptr_bias);
        float32x4_t vin2 = vld1q_f32(ptr_out + 4);
        float32x4_t vb2 = vld1q_f32(ptr_bias + 4);
        float32x4_t vin3 = vld1q_f32(ptr_out + 8);
        float32x4_t vb3 = vld1q_f32(ptr_bias + 8);
        float32x4_t vin4 = vld1q_f32(ptr_out + 12);
        float32x4_t vb4 = vld1q_f32(ptr_bias + 12);
        float32x4_t vout1 = vaddq_f32(vin1, vb1);
        float32x4_t vout2 = vaddq_f32(vin2, vb2);
        float32x4_t vout3 = vaddq_f32(vin3, vb3);
        float32x4_t vout4 = vaddq_f32(vin4, vb4);
        vst1q_f32(ptr_out, vout1);
        vst1q_f32(ptr_out + 4, vout2);
        vst1q_f32(ptr_out + 8, vout3);
        vst1q_f32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail, bias only.
      for (int i = 0; i < remain; ++i) {
        *(ptr_out++) += *(ptr_bias++);
      }
    }
  }
}
// Adds a per-row bias to a [num, channel] FC output and optionally applies
// ReLU in the same pass (int32 specialization).
// NOTE(review): the previous text was flattened diff residue -- it contained
// both the old and the new signature plus a dead `#if 0` assembly block.
// This is the reconstructed post-merge implementation.
template <>
void fill_bias_fc<int>(
    int *out, const int *bias, int num, int channel, bool flag_relu) {
  int cnt = channel >> 4;    // 16 ints per unrolled step
  int remain = channel & 15;
  if (flag_relu) {
    int32x4_t vzero = vdupq_n_s32(0);
    for (int j = 0; j < num; ++j) {
      const int *ptr_bias = bias;
      int *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        int32x4_t vin1 = vld1q_s32(ptr_out);
        int32x4_t vb1 = vld1q_s32(ptr_bias);
        int32x4_t vin2 = vld1q_s32(ptr_out + 4);
        int32x4_t vb2 = vld1q_s32(ptr_bias + 4);
        int32x4_t vin3 = vld1q_s32(ptr_out + 8);
        int32x4_t vb3 = vld1q_s32(ptr_bias + 8);
        int32x4_t vin4 = vld1q_s32(ptr_out + 12);
        int32x4_t vb4 = vld1q_s32(ptr_bias + 12);
        int32x4_t vout1 = vaddq_s32(vin1, vb1);
        int32x4_t vout2 = vaddq_s32(vin2, vb2);
        int32x4_t vout3 = vaddq_s32(vin3, vb3);
        int32x4_t vout4 = vaddq_s32(vin4, vb4);
        vout1 = vmaxq_s32(vout1, vzero);
        vout2 = vmaxq_s32(vout2, vzero);
        vout3 = vmaxq_s32(vout3, vzero);
        vout4 = vmaxq_s32(vout4, vzero);
        vst1q_s32(ptr_out, vout1);
        vst1q_s32(ptr_out + 4, vout2);
        vst1q_s32(ptr_out + 8, vout3);
        vst1q_s32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail with fused ReLU.
      for (int i = 0; i < remain; ++i) {
        *ptr_out += *(ptr_bias++);
        *ptr_out = *ptr_out > 0 ? *ptr_out : 0;
        ptr_out++;
      }
    }
  } else {
    for (int j = 0; j < num; ++j) {
      const int *ptr_bias = bias;
      int *ptr_out = out + j * channel;
      for (int i = 0; i < cnt; ++i) {
        int32x4_t vin1 = vld1q_s32(ptr_out);
        int32x4_t vb1 = vld1q_s32(ptr_bias);
        int32x4_t vin2 = vld1q_s32(ptr_out + 4);
        int32x4_t vb2 = vld1q_s32(ptr_bias + 4);
        int32x4_t vin3 = vld1q_s32(ptr_out + 8);
        int32x4_t vb3 = vld1q_s32(ptr_bias + 8);
        int32x4_t vin4 = vld1q_s32(ptr_out + 12);
        int32x4_t vb4 = vld1q_s32(ptr_bias + 12);
        int32x4_t vout1 = vaddq_s32(vin1, vb1);
        int32x4_t vout2 = vaddq_s32(vin2, vb2);
        int32x4_t vout3 = vaddq_s32(vin3, vb3);
        int32x4_t vout4 = vaddq_s32(vin4, vb4);
        vst1q_s32(ptr_out, vout1);
        vst1q_s32(ptr_out + 4, vout2);
        vst1q_s32(ptr_out + 8, vout3);
        vst1q_s32(ptr_out + 12, vout4);
        ptr_out += 16;
        ptr_bias += 16;
      }
      // Scalar tail, bias only.
      for (int i = 0; i < remain; ++i) {
        *(ptr_out++) += *(ptr_bias++);
      }
    }
  }
}
......
......@@ -356,7 +356,8 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
}
// Adds a per-row bias to a [num, channel] tensor in place and optionally
// applies ReLU in the same pass.  Specialized for float and int.
// NOTE(review): the stale pre-merge declaration (without flag_relu) that was
// left by the flattened diff has been removed.
template <typename T>
void fill_bias_fc(
    T* tensor, const T* bias, int num, int channel, bool flag_relu);
template <lite_api::ActivationType Act = lite_api::ActivationType::kIndentity>
inline float32x4_t vactive_f32(const float32x4_t& x) {
......
......@@ -383,6 +383,8 @@ struct GRUUnitFunctor {
const lite_api::ActivationType active_gate,
bool origin_mode,
ARMContext* ctx) {
operators::ActivationParam act_param;
act_param.has_active = false;
if (value.prev_out_value) {
sgemm(false,
false,
......@@ -399,7 +401,7 @@ struct GRUUnitFunctor {
frame_size * 3,
nullptr,
false,
false,
act_param,
ctx);
}
gru_unit_reset_act(active_gate, value, frame_size, batch_size);
......@@ -420,7 +422,7 @@ struct GRUUnitFunctor {
frame_size * 3,
nullptr,
false,
false,
act_param,
ctx);
}
......
......@@ -14,6 +14,7 @@
#include "lite/backends/arm/math/packed_sgemm.h"
#include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
namespace paddle {
namespace lite {
......@@ -51,7 +52,7 @@ void sgemm_prepacked_8x12(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
void pack_m4(float *out,
......@@ -83,7 +84,7 @@ void sgemm_prepacked_4x4(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
#else
// for kA72
......@@ -136,7 +137,7 @@ void sgemm_prepacked_6x8(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
// for kA73, 4x8
void sgemm_prepacked_4x8(bool is_transB,
......@@ -151,7 +152,7 @@ void sgemm_prepacked_4x8(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx);
#endif // __aarch64__
......@@ -249,7 +250,7 @@ void sgemm_prepack(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
#ifdef __aarch64__
if (M <= 4) {
......@@ -265,7 +266,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
} else {
sgemm_prepacked_8x12(is_transB,
......@@ -280,7 +281,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
}
#else // armv7
......@@ -297,7 +298,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
} else {
sgemm_prepacked_6x8(is_transB,
......@@ -312,7 +313,7 @@ void sgemm_prepack(bool is_transB,
ldc,
bias,
has_bias,
has_relu,
act_param,
ctx);
}
#endif // arm64
......@@ -2283,7 +2284,7 @@ void sgemm_prepacked_8x12(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto workspace = ctx->workspace_data<float>();
......@@ -2837,33 +2838,6 @@ void sgemm_prepacked_8x12(bool is_transB,
"fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/
"fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/
"11: \n" /* check if relu */
"cbz %w[relu], 12f\n" /* skip relu */
"movi v2.4s, #0\n" /* for relu*/
"fmax v8.4s, v8.4s, v2.4s\n" /* relu*/
"fmax v9.4s, v9.4s, v2.4s\n" /* relu*/
"fmax v10.4s, v10.4s, v2.4s\n" /* relu*/
"fmax v11.4s, v11.4s, v2.4s\n" /* relu*/
"fmax v12.4s, v12.4s, v2.4s\n" /* relu*/
"fmax v13.4s, v13.4s, v2.4s\n" /* relu*/
"fmax v14.4s, v14.4s, v2.4s\n" /* relu*/
"fmax v15.4s, v15.4s, v2.4s\n" /* relu*/
"fmax v16.4s,v16.4s,v2.4s\n" /* relu*/
"fmax v17.4s,v17.4s,v2.4s\n" /* relu*/
"fmax v18.4s, v18.4s, v2.4s\n" /* relu*/
"fmax v19.4s, v19.4s, v2.4s\n" /* relu*/
"fmax v20.4s, v20.4s, v2.4s\n" /* relu*/
"fmax v21.4s, v21.4s, v2.4s\n" /* relu*/
"fmax v22.4s, v22.4s, v2.4s\n" /* relu*/
"fmax v23.4s, v23.4s, v2.4s\n" /* relu*/
"fmax v24.4s,v24.4s,v2.4s\n" /* relu*/
"fmax v25.4s,v25.4s,v2.4s\n" /* relu*/
"fmax v26.4s, v26.4s, v2.4s\n" /* relu*/
"fmax v27.4s, v27.4s, v2.4s\n" /* relu*/
"fmax v28.4s, v28.4s, v2.4s\n" /* relu*/
"fmax v29.4s, v29.4s, v2.4s\n" /* relu*/
"fmax v30.4s, v30.4s, v2.4s\n" /* relu*/
"fmax v31.4s, v31.4s, v2.4s\n" /* relu*/
"12: \n"
"st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */
"st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */
"st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */
......@@ -2886,7 +2860,6 @@ void sgemm_prepacked_8x12(bool is_transB,
[c_ptr6] "+r"(c_ptr6),
[c_ptr7] "+r"(c_ptr7)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[has_beta] "r"(has_beta),
[beta] "r"(beta)
: "cc","memory",
......@@ -2911,6 +2884,13 @@ void sgemm_prepacked_8x12(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float *dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
void sgemm_prepacked_4x4(bool is_transB,
......@@ -2925,7 +2905,7 @@ void sgemm_prepacked_4x4(bool is_transB,
int ldc,
const float *bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext *ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto workspace = ctx->workspace_data<float>();
......@@ -3158,13 +3138,6 @@ void sgemm_prepacked_4x4(bool is_transB,
"fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/
"11: \n" /* check if relu */
"cbz %w[relu], 12f\n" /* skip relu */
"movi v2.4s, #0\n" /* for relu*/
"fmax v8.4s, v8.4s, v2.4s\n" /* relu*/
"fmax v9.4s, v9.4s, v2.4s\n" /* relu*/
"fmax v10.4s, v10.4s, v2.4s\n" /* relu*/
"fmax v11.4s, v11.4s, v2.4s\n" /* relu*/
"12: \n"
"st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */
"st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */
"st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */
......@@ -3179,7 +3152,6 @@ void sgemm_prepacked_4x4(bool is_transB,
[c_ptr2] "+r"(c_ptr2),
[c_ptr3] "+r"(c_ptr3)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[has_beta] "r"(has_beta),
[beta] "r"(beta)
: "cc","memory",
......@@ -3197,6 +3169,13 @@ void sgemm_prepacked_4x4(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float *dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
#else // __aarch64__
/**
......@@ -3222,7 +3201,7 @@ void sgemm_prepacked_6x8(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto* workspace = ctx->workspace_data<float>();
......@@ -3601,22 +3580,6 @@ void sgemm_prepacked_6x8(bool is_transB,
"vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n"
"vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n"
"2: @ check relu\n"
"cmp %[relu], #0 @ check if has relu\n"
"ble 6f @ skip relu if relu <= 0\n"
"vmov.u32 q0, #0 @ for relu\n"
"vmax.f32 q4, q4, q0 @ for relu\n"
"vmax.f32 q5, q5, q0 @ for relu\n"
"vmax.f32 q6, q6, q0 @ for relu\n"
"vmax.f32 q7, q7, q0 @ for relu\n"
"vmax.f32 q8, q8, q0 @ for relu\n"
"vmax.f32 q9, q9, q0 @ for relu\n"
"vmax.f32 q10, q10, q0 @ for relu\n"
"vmax.f32 q11, q11, q0 @ for relu\n"
"vmax.f32 q12, q12, q0 @ for relu\n"
"vmax.f32 q13, q13, q0 @ for relu\n"
"vmax.f32 q14, q14, q0 @ for relu\n"
"vmax.f32 q15, q15, q0 @ for relu\n"
"6: @ store result\n"
"vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n"
"vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n"
"vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n"
......@@ -3634,7 +3597,6 @@ void sgemm_prepacked_6x8(bool is_transB,
[k] "+r"(k),
[tails] "+r"(tails)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[beta] "r"(beta)
: "q0","q1","q2","q3","q4",
"q5","q6","q7","q8","q9","q10","q11",
......@@ -3654,6 +3616,13 @@ void sgemm_prepacked_6x8(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float* dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
void sgemm_prepacked_4x8(bool is_transB,
......@@ -3668,7 +3637,7 @@ void sgemm_prepacked_4x8(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
auto* workspace = ctx->workspace_data<float>();
......@@ -3953,18 +3922,6 @@ void sgemm_prepacked_4x8(bool is_transB,
/*aptr - 16*/
"sub %[a_ptr], %[a_ptr], #16 @ tail--\n"
"2: @ check relu\n"
"cmp %[relu], #0 @ check if has relu\n"
"ble 6f @ skip relu if relu <= 0\n"
"vmov.u32 q0, #0 @ for relu\n"
"vmax.f32 q8, q8, q0 @ for relu\n"
"vmax.f32 q9, q9, q0 @ for relu\n"
"vmax.f32 q10, q10, q0 @ for relu\n"
"vmax.f32 q11, q11, q0 @ for relu\n"
"vmax.f32 q12, q12, q0 @ for relu\n"
"vmax.f32 q13, q13, q0 @ for relu\n"
"vmax.f32 q14, q14, q0 @ for relu\n"
"vmax.f32 q15, q15, q0 @ for relu\n"
"6: @ store result\n"
"vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n"
"vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n"
"vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n"
......@@ -3978,7 +3935,6 @@ void sgemm_prepacked_4x8(bool is_transB,
[k] "+r"(k),
[tails] "+r"(tails)
: [bias_ptr] "r"(bias_local),
[relu] "r"(has_relu),
[beta] "r"(beta)
: "q0","q1","q2","q3",
"q4","q5","q6","q7","q8","q9","q10",
......@@ -3995,6 +3951,13 @@ void sgemm_prepacked_4x8(bool is_transB,
}
}
}
if (act_param.has_active) {
#pragma omp parallel for num_threads(threads)
for (unsigned int x = 0; x < M; x++) {
float* dst = C + x * ldc;
act_switch_process(dst, dst, N, &act_param);
}
}
}
#endif // __aarch64__
......
......@@ -17,6 +17,7 @@
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
......@@ -74,7 +75,7 @@ void sgemm_prepack(bool is_transB,
int ldc,
const float* bias,
bool has_bias,
bool has_relu,
const operators::ActivationParam act_param,
ARMContext* ctx);
} // namespace math
......
......@@ -898,6 +898,121 @@ void pooling_global_avg(const float* din,
}
}
// 1x1 max pooling with stride 2, no padding: for a 1x1 window "max" is a
// copy, so this picks every second element of every second input row.
// NOTE(review): output positions whose source column falls past the input
// width are written as 0.f rather than clamped -- confirm this matches the
// pooling operator's boundary convention.
void pooling1x1s2p0_max(const float* din,
                        float* dout,
                        int num,
                        int chout,
                        int hout,
                        int wout,
                        int chin,
                        int hin,
                        int win) {
  int size_channel_out = wout * hout;
  int size_channel_in = win * hin;
  auto data_out = static_cast<float*>(dout);
  auto data_in = static_cast<const float*>(din);
  // Each unrolled step de-interleaves 8 input floats and emits 4 outputs.
  int w_unroll_size = wout / 4;
  int w_unroll_remian = wout - w_unroll_size * 4;
  int win_ext = w_unroll_size * 8;
  // Scratch rows: zero_ptr is read in place of out-of-range input rows,
  // write_ptr absorbs stores aimed at out-of-range output rows.
  auto zero_ptr =
      static_cast<float*>(TargetMalloc(TARGET(kARM), win * sizeof(float)));
  memset(zero_ptr, 0, win * sizeof(float));
  auto write_ptr =
      static_cast<float*>(TargetMalloc(TARGET(kARM), wout * sizeof(float)));
  for (int n = 0; n < num; ++n) {
    float* data_out_batch = data_out + n * chout * size_channel_out;
    const float* data_in_batch = data_in + n * chin * size_channel_in;
#pragma omp parallel for
    for (int c = 0; c < chout; c++) {
      float* data_out_channel = data_out_batch + c * size_channel_out;
      const float* data_in_channel = data_in_batch + c * size_channel_in;
      // Process 4 output rows (reading from 4 stride-2 input rows) per step.
      for (int h = 0; h < hout; h += 4) {
        const float* din0_ptr = data_in_channel + h * 2 * win;
        const float* din1_ptr = din0_ptr + 2 * win;
        const float* din2_ptr = din1_ptr + 2 * win;
        const float* din3_ptr = din2_ptr + 2 * win;
        float* doutr0 = data_out_channel + h * wout;
        float* doutr1 = doutr0 + wout;
        float* doutr2 = doutr1 + wout;
        float* doutr3 = doutr2 + wout;
        // Redirect out-of-range output rows to the scratch row.
        // The switch fall-through is intentional.
        if (h + 4 > hout) {
          switch (h + 4 - hout) {
            case 3:
              doutr1 = write_ptr;
            case 2:
              doutr2 = write_ptr;
            case 1:
              doutr3 = write_ptr;
            default:
              break;
          }
        }
        // Redirect out-of-range input rows to the zero row.
        // The switch fall-through is intentional.
        if (h * 2 + 7 > hin) {
          switch (h * 2 + 7 - hin) {
            case 7:
              din0_ptr = zero_ptr;
            case 6:
            case 5:
              din1_ptr = zero_ptr;
            case 4:
            case 3:
              din2_ptr = zero_ptr;
            case 2:
            case 1:
              din3_ptr = zero_ptr;
            default:
              break;
          }
        }
        // vld2 de-interleaves even/odd columns; val[0] holds the even
        // (stride-2) elements, i.e. the pooled outputs.
        for (int i = 0; i < w_unroll_size; i++) {
          float32x4x2_t din0 = vld2q_f32(din0_ptr);
          float32x4x2_t din1 = vld2q_f32(din1_ptr);
          float32x4x2_t din2 = vld2q_f32(din2_ptr);
          float32x4x2_t din3 = vld2q_f32(din3_ptr);
          din0_ptr += 8;
          din1_ptr += 8;
          din2_ptr += 8;
          din3_ptr += 8;
          vst1q_f32(doutr0, din0.val[0]);
          vst1q_f32(doutr1, din1.val[0]);
          vst1q_f32(doutr2, din2.val[0]);
          vst1q_f32(doutr3, din3.val[0]);
          doutr0 += 4;
          doutr1 += 4;
          doutr2 += 4;
          doutr3 += 4;
        }
        // Scalar tail; j tracks the input column of the next output.
        int j = win_ext;
        for (int i = 0; i < w_unroll_remian; i++) {
          if (j >= win) {
            *doutr0++ = 0.f;
            *doutr1++ = 0.f;
            *doutr2++ = 0.f;
            *doutr3++ = 0.f;
          } else {
            *doutr0++ = *din0_ptr;
            *doutr1++ = *din1_ptr;
            *doutr2++ = *din2_ptr;
            *doutr3++ = *din3_ptr;
            din0_ptr += 2;
            din1_ptr += 2;
            din2_ptr += 2;
            din3_ptr += 2;
          }
          j += 2;
        }
      }
    }
  }
  TargetFree(TARGET(kARM), zero_ptr);
  TargetFree(TARGET(kARM), write_ptr);
}
void pooling2x2s2_max(const float* din,
float* dout,
int num,
......
......@@ -64,6 +64,16 @@ void pooling_global_avg(const float* din,
int hin,
int win);
// 1x1 max pooling with stride 2 and no padding; for a 1x1 window this copies
// every second element of every second input row into dout.
void pooling1x1s2p0_max(const float* din,
                        float* dout,
                        int num,
                        int chout,
                        int hout,
                        int wout,
                        int chin,
                        int hin,
                        int win);
void pooling2x2s2_max(const float* din,
float* dout,
int num,
......
......@@ -34,7 +34,7 @@ void sgemm(bool is_transA,
int ldc,
const float* bias,
bool is_bias,
bool is_relu,
const operators::ActivationParam act_param,
ARMContext* ctx) {
int hblock = get_hblock(ctx);
int m_roundup = hblock * ((M + hblock - 1) / hblock);
......@@ -56,7 +56,7 @@ void sgemm(bool is_transA,
ldc,
bias,
is_bias,
is_relu,
act_param,
ctx);
TargetFree(TargetType::kARM, packed_A);
}
......
......@@ -39,7 +39,7 @@ void sgemm(bool is_transA,
int ldc,
const float* bias,
bool is_bias,
bool is_relu,
const operators::ActivationParam act_param,
ARMContext* ctx);
} // namespace math
......
# Build the BM backend's target wrapper only when the BM backend is enabled.
if(NOT LITE_WITH_BM)
  return()
endif()

lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/bm/target_wrapper.h"
#include <bmcompiler_if.h>
#include <bmlib_runtime.h>
#include <utility>
namespace paddle {
namespace lite {
int TargetWrapperBM::device_id_ = 0;
std::map<int, void*> TargetWrapperBM::bm_hds_;
// Returns the number of BM devices visible to the runtime.
size_t TargetWrapperBM::num_devices() {
  int device_count = 0;
  bm_dev_getcount(&device_count);
  return device_count;
}
void TargetWrapperBM::SetDevice(int id) {
/*
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
*/
device_id_ = id;
if (bm_hds_.find(id) == bm_hds_.end()) {
bm_handle_t bm_handle;
bm_status_t ret = bm_dev_request(&bm_handle, id);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
bm_hds_.insert(std::pair<int, bm_handle_t>(id, bm_handle));
}
return;
}
// Returns the raw bm_handle_t (as void*) for the current device; aborts if
// SetDevice() was never called for it.
void* TargetWrapperBM::GetHandle() {
  auto it = bm_hds_.find(device_id_);
  if (it == bm_hds_.end()) {
    LOG(FATAL) << "device not initialized " << device_id_;
  }
  return it->second;
}
void* TargetWrapperBM::Malloc(size_t size) {
void* ptr{};
if (bm_hds_.find(device_id_) == bm_hds_.end()) {
SetDevice(device_id_);
}
bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
bm_device_mem_t* p_mem =
reinterpret_cast<bm_device_mem_t*>(malloc(sizeof(bm_device_mem_t)));
bm_malloc_device_byte(bm_handle, p_mem, size);
ptr = reinterpret_cast<void*>(p_mem);
return ptr;
}
void TargetWrapperBM::Free(void* ptr) {
if (ptr != NULL) {
bm_handle_t bm_handle = static_cast<bm_handle_t>(bm_hds_.at(device_id_));
bm_device_mem_t* mem = static_cast<bm_device_mem_t*>(ptr);
bm_free_device(bm_handle, *mem);
free(ptr);
}
return;
}
// Synchronous copy between host memory and BM device memory.  Device-side
// pointers are bm_device_mem_t descriptors as produced by Malloc().
// NOTE(review): silently a no-op when the current device was never
// initialized -- confirm callers rely on this rather than an error.
void TargetWrapperBM::MemcpySync(void* dst,
                                 const void* src,
                                 size_t size,
                                 IoDirection dir) {
  auto it = bm_hds_.find(device_id_);
  if (it == bm_hds_.end()) {
    return;
  }
  bm_handle_t bm_handle = static_cast<bm_handle_t>(it->second);
  if (dir == IoDirection::HtoD) {
    auto* dev_dst = static_cast<bm_device_mem_t*>(dst);
    bm_memcpy_s2d_partial_offset(
        bm_handle, *dev_dst, const_cast<void*>(src), size, 0);
  } else if (dir == IoDirection::DtoH) {
    const auto* dev_src = static_cast<const bm_device_mem_t*>(src);
    bm_memcpy_d2s_partial_offset(bm_handle, dst, *dev_src, size, 0);
  } else {
    LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
  }
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperBM = TargetWrapper<TARGET(kBM)>;
// BM (Bitmain NPU) specialization of TargetWrapper.
// The stream/event/memset entry points below are intentional no-ops here;
// stream_t/event_t are dummy ints.
template <>
class TargetWrapper<TARGET(kBM)> {
 public:
  using stream_t = int;
  using event_t = int;
  // Number of BM devices present on the host.
  static size_t num_devices();
  static size_t maximum_stream() { return 0; }
  // Selects the active device, opening a runtime handle on first use.
  static void SetDevice(int id);
  static void CreateStream(stream_t* stream) {}
  static void DestroyStream(const stream_t& stream) {}
  static void CreateEvent(event_t* event) {}
  static void DestroyEvent(const event_t& event) {}
  static void RecordEvent(const event_t& event) {}
  static void SyncEvent(const event_t& event) {}
  static void StreamSync(const stream_t& stream) {}
  // Allocate/free device memory.  The returned pointer is a heap-allocated
  // bm_device_mem_t descriptor, not a raw device address.
  static void* Malloc(size_t size);
  static void Free(void* ptr);
  // Raw bm_handle_t (as void*) of the current device.
  static void* GetHandle();
  // Synchronous host<->device copy; device pointers are the descriptors
  // returned by Malloc().
  static void MemcpySync(void* dst,
                         const void* src,
                         size_t size,
                         IoDirection dir);
  static void MemcpyAsync(void* dst,
                          const void* src,
                          size_t size,
                          IoDirection dir,
                          const stream_t& stream) {}
  static void MemsetSync(void* devPtr, int value, size_t count) {}
  static void MemsetAsync(void* devPtr,
                          int value,
                          size_t count,
                          const stream_t& stream) {}
 private:
  // Currently selected device id and the per-device handle map
  // (int -> bm_handle_t stored as void*).
  static int device_id_;
  static std::map<int, void*> bm_hds_;
};
} // namespace lite
} // namespace paddle
# Build CUDA support libraries only when the CUDA backend is enabled.
# NOTE(review): the flattened diff left both the old (CUDA_STATIC_MODULES /
# cuda_static_deps) and new (CUDA_MODULES / cuda_deps) lines, defining the
# same nv_library targets twice; only the post-merge versions are kept.
if(NOT LITE_WITH_CUDA)
  return()
endif()

get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps})

add_subdirectory(math)
......@@ -18,8 +18,8 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ai_ddk_lib/include/HiAiModelManagerService.h"
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "HiAiModelManagerService.h" // NOLINT
#include "hiai_ir_build.h" // NOLINT
namespace paddle {
namespace lite {
......
......@@ -214,3 +214,172 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
CL_DTYPE4 output0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#elif defined(BIASE_ELE)
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0);
CL_DTYPE4 output1 = output0;
CL_DTYPE4 output2 = output0;
CL_DTYPE4 output3 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
CL_DTYPE4 output2 = 0.0f;
CL_DTYPE4 output3 = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
CL_DTYPE4 input0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
CL_DTYPE4 weight0 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0));
CL_DTYPE4 weight1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1));
CL_DTYPE4 weight2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2));
CL_DTYPE4 weight3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3));
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
CL_DTYPE4 input1 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
CL_DTYPE4 input2 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
CL_DTYPE4 input3 =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
}
#ifdef BATCH_NORM
output0 = output0 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation_type4(output0);
output1 = activation_type4(output1);
output2 = activation_type4(output2);
output3 = activation_type4(output3);
#endif
if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}
if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
if (out_w2 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
}
if (out_w3 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Depthwise conv2d kernel: each work-item computes one CL_DTYPE4 output
// pixel. Input, filter and output are image2d_t textures laid out in
// blocks, one block (input_width x input_height / filter_width x
// filter_height) per 4-channel group `out_c`.
// NOTE(review): `dilation`, `global_size_dim0`, `global_size_dim2` and
// `input_c` are accepted but never used below — presumably this kernel
// only supports dilation == 1; confirm against the host-side launcher.
__kernel void depth_conv2d(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int filter_width,
__private const int filter_height) {
// Global ids: (channel group, output x inside one block, batch*height row).
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// Output image x coordinate folds the channel group in front of the width.
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// Split the combined batch/height id back into batch index and row.
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
// Top-left input coordinate for this output pixel; `offset` is the
// padding compensation supplied by the host side.
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
// Per-channel bias: one CL_DTYPE4 per channel group.
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
// Element-wise bias: same layout as the output image.
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
// Base coordinates of this channel group's block inside the input and
// filter images (filter rows are also indexed by batch here).
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
// Filter taps are addressed relative to the filter center.
int2 align = {filter_width / 2, filter_height / 2};
for (int fy = 0; fy < filter_height; ++fy) {
for (int fx = 0; fx < filter_width; ++fx) {
int x_off = fx - align.x;
int y_off = fy - align.y;
// Zero padding: when the tap falls outside this block, select()
// returns the second operand (0). The `<< 15` puts the boolean into
// the MSB of each ushort lane, which is what vector select() tests.
CL_DTYPE4 in = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(input_x_base + x_off, input_y_base + y_off)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + x_off < 0 ||
in_pos_in_one_block.y + y_off < 0 ||
in_pos_in_one_block.x + x_off >= input_width ||
in_pos_in_one_block.y + y_off >= input_height)
<< 15));
CL_DTYPE4 f = READ_IMG_TYPE(
CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + fx, filter_y + fy));
// Depthwise: per-lane multiply-accumulate, no cross-channel reduction.
output += in * f;
}
}
#ifdef BATCH_NORM
// Fused batch norm: output = output * scale[out_c] + bias[out_c].
output = output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
// Fused activation. NOTE(review): activation_type4 comes from
// cl_common.h — presumably selected by build flags, not necessarily
// plain ReLU despite the macro name; confirm.
output = activation_type4(output);
#endif
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
\ No newline at end of file
......@@ -89,7 +89,7 @@ All kernels are inlcuded in `lite/backends/x86/jit/kernels.h`, which is automati
3. Add reference function of `your_key`.
Note:
- this should be run on CPU and do not depend on any third-party.
- Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used.
- Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used.
4. Add unit test in `test.cc`, and verify at least `float` and `double`.
Test more data type for some special functions if necessary, for example `int8`.
5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one.
......
......@@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/
# 如何添加新的算子
1.`KernelType` 中添加 `your_key`
2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。
2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CMakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。
3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。
4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。
5. 添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。
......
......@@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
function(USE_JITKERNEL_GEN TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n")
function(USE_JITKERNEL_GEN_LITE TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n")
endfunction()
# use gen jitcode kernel by name
USE_JITKERNEL_GEN(kMatMul)
USE_JITKERNEL_GEN(kVMul)
USE_JITKERNEL_GEN(kVAdd)
USE_JITKERNEL_GEN(kVSub)
USE_JITKERNEL_GEN(kVAddRelu)
USE_JITKERNEL_GEN(kVScal)
USE_JITKERNEL_GEN(kVAddBias)
USE_JITKERNEL_GEN(kVRelu)
USE_JITKERNEL_GEN(kVSquare)
USE_JITKERNEL_GEN(kVIdentity)
USE_JITKERNEL_GEN(kVExp)
USE_JITKERNEL_GEN(kVSigmoid)
USE_JITKERNEL_GEN(kVTanh)
USE_JITKERNEL_GEN(kLSTMCtHt)
USE_JITKERNEL_GEN(kLSTMC1H1)
USE_JITKERNEL_GEN(kGRUH1)
USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
USE_JITKERNEL_GEN_LITE(kMatMul)
USE_JITKERNEL_GEN_LITE(kVMul)
USE_JITKERNEL_GEN_LITE(kVAdd)
USE_JITKERNEL_GEN_LITE(kVSub)
USE_JITKERNEL_GEN_LITE(kVAddRelu)
USE_JITKERNEL_GEN_LITE(kVScal)
USE_JITKERNEL_GEN_LITE(kVAddBias)
USE_JITKERNEL_GEN_LITE(kVRelu)
USE_JITKERNEL_GEN_LITE(kVSquare)
USE_JITKERNEL_GEN_LITE(kVIdentity)
USE_JITKERNEL_GEN_LITE(kVExp)
USE_JITKERNEL_GEN_LITE(kVSigmoid)
USE_JITKERNEL_GEN_LITE(kVTanh)
USE_JITKERNEL_GEN_LITE(kLSTMCtHt)
USE_JITKERNEL_GEN_LITE(kLSTMC1H1)
USE_JITKERNEL_GEN_LITE(kGRUH1)
USE_JITKERNEL_GEN_LITE(kGRUHtPart1)
USE_JITKERNEL_GEN_LITE(kGRUHtPart2)
USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC)
USE_JITKERNEL_GEN_LITE(kSeqPool)
USE_JITKERNEL_GEN_LITE(kHMax)
USE_JITKERNEL_GEN_LITE(kHSum)
USE_JITKERNEL_GEN_LITE(kEmbSeqPool)
USE_JITKERNEL_GEN_LITE(kSgd)
USE_JITKERNEL_GEN_LITE(kVBroadcast)
......@@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator);
REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator);
REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator);
REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator);
REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator);
REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator);
......@@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator);
REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator);
REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator);
REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator);
REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator);
REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator);
REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator);
REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator);
REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator);
REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator);
REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator);
......@@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator);
REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator);
......@@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator);
REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator);
REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator);
......@@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator);
REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator);
REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator);
REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator);
......@@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1);
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator);
REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator);
REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator);
REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator);
......@@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator);
REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator);
......@@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator);
REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator);
......@@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator);
......@@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
namespace gen = paddle::lite::jit::gen;
REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator);
function(USE_JITKERNEL_MORE TARGET TYPE)
file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
function(USE_JITKERNEL_MORE_LITE TARGET TYPE)
file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n")
endfunction()
# enable it later
......
......@@ -5,5 +5,5 @@ cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
# use mkl kernels by name and type
USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic)
USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic)
......@@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE)
USE_JITKERNEL_MORE(kVSigmoid, mix)
USE_JITKERNEL_MORE(kVTanh, mix)
USE_JITKERNEL_MORE(kLSTMCtHt, mix)
USE_JITKERNEL_MORE(kLSTMC1H1, mix)
USE_JITKERNEL_MORE(kGRUH1, mix)
USE_JITKERNEL_MORE(kGRUHtPart1, mix)
USE_JITKERNEL_MORE(kGRUHtPart2, mix)
USE_JITKERNEL_MORE(kSoftmax, mix)
USE_JITKERNEL_MORE_LITE(kVSigmoid, mix)
USE_JITKERNEL_MORE_LITE(kVTanh, mix)
USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix)
USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix)
USE_JITKERNEL_MORE_LITE(kGRUH1, mix)
USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix)
USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix)
USE_JITKERNEL_MORE_LITE(kSoftmax, mix)
......@@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE)
# use mkl kernels by name and type
USE_JITKERNEL_MORE(kMatMul, mkl)
USE_JITKERNEL_MORE(kVMul, mkl)
USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kStrideScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
USE_JITKERNEL_MORE_LITE(kMatMul, mkl)
USE_JITKERNEL_MORE_LITE(kVMul, mkl)
USE_JITKERNEL_MORE_LITE(kVAdd, mkl)
USE_JITKERNEL_MORE_LITE(kVScal, mkl)
USE_JITKERNEL_MORE_LITE(kStrideScal, mkl)
USE_JITKERNEL_MORE_LITE(kVExp, mkl)
USE_JITKERNEL_MORE_LITE(kVSquare, mkl)
USE_JITKERNEL_MORE_LITE(kVCopy, mkl)
USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl)
USE_JITKERNEL_MORE_LITE(kVTanh, mkl)
USE_JITKERNEL_MORE_LITE(kSeqPool, mkl)
USE_JITKERNEL_MORE_LITE(kSoftmax, mkl)
USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE_LITE(kSgd, mkl)
USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl)
......@@ -2,39 +2,39 @@
cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE)
function(USE_JITKERNEL_REFER TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n")
function(USE_JITKERNEL_REFER_LITE TARGET)
file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n")
endfunction()
# use refer kernel by name
USE_JITKERNEL_REFER(kVMul)
USE_JITKERNEL_REFER(kVAdd)
USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kStrideScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
USE_JITKERNEL_REFER(kVSigmoid)
USE_JITKERNEL_REFER(kVTanh)
USE_JITKERNEL_REFER(kLSTMCtHt)
USE_JITKERNEL_REFER(kLSTMC1H1)
USE_JITKERNEL_REFER(kGRUH1)
USE_JITKERNEL_REFER(kGRUHtPart1)
USE_JITKERNEL_REFER(kGRUHtPart2)
USE_JITKERNEL_REFER(kCRFDecoding)
USE_JITKERNEL_REFER(kLayerNorm)
USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool)
USE_JITKERNEL_REFER(kMatMul)
USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kStrideASum)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
USE_JITKERNEL_REFER_LITE(kVMul)
USE_JITKERNEL_REFER_LITE(kVAdd)
USE_JITKERNEL_REFER_LITE(kVAddRelu)
USE_JITKERNEL_REFER_LITE(kVSub)
USE_JITKERNEL_REFER_LITE(kVScal)
USE_JITKERNEL_REFER_LITE(kStrideScal)
USE_JITKERNEL_REFER_LITE(kVAddBias)
USE_JITKERNEL_REFER_LITE(kVCopy)
USE_JITKERNEL_REFER_LITE(kVRelu)
USE_JITKERNEL_REFER_LITE(kVIdentity)
USE_JITKERNEL_REFER_LITE(kVExp)
USE_JITKERNEL_REFER_LITE(kVSigmoid)
USE_JITKERNEL_REFER_LITE(kVTanh)
USE_JITKERNEL_REFER_LITE(kLSTMCtHt)
USE_JITKERNEL_REFER_LITE(kLSTMC1H1)
USE_JITKERNEL_REFER_LITE(kGRUH1)
USE_JITKERNEL_REFER_LITE(kGRUHtPart1)
USE_JITKERNEL_REFER_LITE(kGRUHtPart2)
USE_JITKERNEL_REFER_LITE(kCRFDecoding)
USE_JITKERNEL_REFER_LITE(kLayerNorm)
USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC)
USE_JITKERNEL_REFER_LITE(kSeqPool)
USE_JITKERNEL_REFER_LITE(kMatMul)
USE_JITKERNEL_REFER_LITE(kVSquare)
USE_JITKERNEL_REFER_LITE(kHSum)
USE_JITKERNEL_REFER_LITE(kHMax)
USE_JITKERNEL_REFER_LITE(kStrideASum)
USE_JITKERNEL_REFER_LITE(kSoftmax)
USE_JITKERNEL_REFER_LITE(kEmbSeqPool)
USE_JITKERNEL_REFER_LITE(kSgd)
USE_JITKERNEL_REFER_LITE(kVBroadcast)
......@@ -18,7 +18,7 @@
namespace refer = paddle::lite::jit::refer;
#define REGISTER_REFER_KERNEL(func) \
REGISTER_JITKERNEL_REFER( \
REGISTER_JITKERNEL_REFER_LITE( \
k##func, refer::func##Kernel<float>, refer::func##Kernel<double>)
REGISTER_REFER_KERNEL(VMul);
......
......@@ -77,16 +77,16 @@ class JitKernelRegistrar {
void Touch() {}
};
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
// Refer always on CPUPlace
#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_refer_CPUPlace, \
#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_refer_CPUPlace, \
"REGISTER_KERNEL_REFER must be called in global namespace"); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::ReferKernelPool, \
......@@ -94,84 +94,84 @@ class JitKernelRegistrar {
__VA_ARGS__> \
__jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \
int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \
__jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \
return 0; \
}
// kernel_type: should be in paddle::lite::jit::KernelType
// place_type: should be one of CPUPlace and GPUPlace in paddle::platform
#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \
"REGISTER_KERNEL_MORE must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \
"REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \
UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::KernelPool, \
::paddle::lite::fluid::place_type, \
__VA_ARGS__> \
__jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \
int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \
__jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \
.Touch(); \
return 0; \
}
#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_gen_##kernel_type##_CPUPlace_, \
"REGISTER_JITKERNEL_GEN must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \
TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::JitCodeCreatorPool, \
::paddle::lite::fluid::CPUPlace, \
__VA_ARGS__> \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \
return 0; \
REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__)
#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \
REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__)
#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \
"REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static ::paddle::lite::jit::JitKernelRegistrar< \
::paddle::lite::jit::JitCodeCreatorPool, \
::paddle::lite::fluid::CPUPlace, \
__VA_ARGS__> \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \
::paddle::lite::jit::KernelType::kernel_type); \
int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \
__jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \
return 0; \
}
#define USE_JITKERNEL_GEN(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_gen_##kernel_type##_CPUPlace_, \
"USE_JITKERNEL_GEN must be called in global namespace"); \
extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \
static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
TouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
#define USE_JITKERNEL_REFER(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_refer_CPUPlace_, \
"USE_JITKERNEL_REFER must be called in global namespace"); \
extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
TouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \
"USE_JITKERNEL_MORE must be called in global namespace"); \
extern int \
TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \
UNUSED = \
TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
#define USE_JITKERNEL_MORE(kernel_type, impl_type) \
USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace)
#define USE_JITKERNEL_GEN_LITE(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \
"USE_JITKERNEL_GEN_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \
static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \
LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_()
#define USE_JITKERNEL_REFER_LITE(kernel_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \
"USE_JITKERNEL_REFER_LITE must be called in global namespace"); \
extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \
static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_()
#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \
__reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \
"USE_JITKERNEL_MORE_LITE must be called in global namespace"); \
extern int \
LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \
static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \
UNUSED = \
LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_()
#define USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \
USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace)
} // namespace jit
} // namespace lite
......
......@@ -36,8 +36,11 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
}
xtcl::xNetwork network =
builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs));
auto target = xtcl::Target::Create(device_name_);
auto compiler = xtcl::network::xTensorCompiler(network, target);
auto target = xtcl::NullValue<xtcl::Target>();
if (!target_.empty()) {
target = xtcl::Target::Create(target_);
}
xtcl::network::xTensorCompiler compiler(network, target);
compiler.SetParams(*params); // Set the data of constant tensors
compiler.Build();
VLOG(3) << "[XPU] Build done";
......
......@@ -15,6 +15,7 @@
#pragma once
#include <xtcl/xtcl.h>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
......@@ -30,7 +31,18 @@ class Device {
static Device x;
return x;
}
Device() {}
Device() {
char* name = std::getenv("XPU_DEVICE_NAME");
if (name) {
name_ = std::string(name);
}
// XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu
// -libs=xdnn'
char* target = std::getenv("XPU_DEVICE_TARGET");
if (target) {
target_ = std::string(target);
}
}
// Build the XPU graph to the XPU runtime, return the XPU runtime which can be
// used to run inference.
......@@ -39,10 +51,12 @@ class Device {
xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
std::vector<xtcl::xExpr*>* outputs);
const std::string name() const { return name_; }
const std::string target() const { return target_; }
private:
// Keep reserved fields
int device_id_{0};
std::string device_name_{"llvm"};
std::string name_{""};
std::string target_{""};
};
} // namespace xpu
......
......@@ -6,7 +6,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper)
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -107,7 +107,8 @@ class TestCase {
void SetCommonTensor(const std::string& var_name,
const DDim& ddim,
const T* data,
const LoD& lod = {}) {
const LoD& lod = {},
bool is_persistable = false) {
auto* tensor = scope_->NewTensor(var_name);
tensor->Resize(ddim);
auto* d = tensor->mutable_data<T>();
......@@ -115,6 +116,8 @@ class TestCase {
// set lod
if (!lod.empty()) *tensor->mutable_lod() = lod;
// set persistable
tensor->set_persistable(is_persistable);
}
// Prepare for the operator.
......
......@@ -55,6 +55,7 @@ using NPUContext = Context<TargetType::kNPU>;
using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
template <>
class Context<TargetType::kHost> {
......@@ -82,6 +83,23 @@ class Context<TargetType::kNPU> {
};
#endif
#ifdef LITE_WITH_BM
template <>
class Context<TargetType::kBM> {
public:
Context() {}
explicit Context(const BMContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { Init(0); }
void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
void CopySharedTo(BMContext* ctx) {}
void* GetHandle() { return TargetWrapperBM::GetHandle(); }
std::string name() const { return "BMContext"; }
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
......@@ -374,6 +392,12 @@ class ContextScheduler {
kernel_contexts_[TargetType::kFPGA].As<FPGAContext>().CopySharedTo(
&ctx->As<FPGAContext>());
break;
#endif
#ifdef LITE_WITH_BM
case TARGET(kBM):
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>());
break;
#endif
default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
......@@ -412,6 +436,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>();
#endif
#ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>();
#endif
}
......
......@@ -79,7 +79,7 @@ const int DEFAULT_L3_CACHE_SIZE = 0;
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int max_cpu_num = 128;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
......@@ -227,19 +227,24 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
#ifdef LITE_WITH_LINUX
std::string get_cpu_name() {
std::string cpu_name;
std::string cpu_name = "";
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
}
char line[1024];
bool first_model_name = true;
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
}
if (strstr(line, "Hardware") != NULL) {
cpu_name = std::string(line);
cpu_name += std::string(line);
}
if (strstr(line, "model name") != NULL && first_model_name) {
cpu_name += std::string(line);
first_model_name = false;
}
}
#ifdef LITE_WITH_ANDROID
......@@ -816,6 +821,21 @@ bool DeviceInfo::SetCPUInfoByName() {
SetFP16Info(1, 1);
SetDotInfo(1, 1);
return true;
} else if (dev_name_.find("FT2000PLUS") != std::string::npos) {
core_num_ = 64;
core_ids_.resize(core_num_);
big_core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
core_ids_[i] = i;
big_core_ids_[i] = i;
cluster_ids_[i] = 0;
}
little_core_ids_ = {};
SetCacheInfo(0, 1, 64 * 1024);
SetCacheInfo(1, 1, 32 * 1024 * 1024);
SetCacheInfo(2, 1, 128 * 1024 * 1024);
return true;
}
return false;
}
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package paddle.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
......
{
global:
*paddle*;
*touch_*;
*mir_pass_*;
local:
*;
};
......@@ -40,6 +40,11 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kFPGA)>::Malloc(size);
break;
#endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_BM
case TargetType::kBM:
data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break;
#endif
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
......@@ -69,6 +74,11 @@ void TargetFree(TargetType target, void* data) {
TargetWrapper<TARGET(kFPGA)>::Free(data);
break;
#endif // LITE_WITH_CUDA
#ifdef LITE_WITH_BM
case TargetType::kBM:
TargetWrapper<TARGET(kBM)>::Free(data);
break;
#endif
default:
LOG(FATAL) << "Unknown type";
}
......@@ -95,6 +105,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_BM
case TargetType::kBM:
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
......@@ -25,6 +25,10 @@
#include "lite/backends/cuda/target_wrapper.h"
#endif // LITE_WITH_CUDA
#ifdef LITE_WITH_BM
#include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM
namespace paddle {
namespace lite {
......@@ -71,6 +75,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_BM
case TARGET(kBM):
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
break;
#endif
}
}
......
......@@ -79,6 +79,9 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
op_desc.SetAttr("act_type", act_type_);
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
} else if (act_type_ == "relu6") {
float alpha = act_op_desc.GetAttr<float>("threshold");
op_desc.SetAttr("fuse_brelu_threshold", alpha);
} else if (act_type_ == "leaky_relu") {
float alpha = act_op_desc.GetAttr<float>("alpha");
op_desc.SetAttr("leaky_relu_alpha", alpha);
......
......@@ -26,9 +26,7 @@ namespace mir {
void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialze fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{
"conv2d", "depthwise_conv2d", "conv2d_transpose"};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
// start fuse using params
for (auto conv_has_bias : conv_has_bias_cases) {
for (auto conv_type : conv_type_cases) {
......@@ -46,4 +44,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kX86), TARGET(kXPU)});
.ExcludeTargets({TARGET(kX86), TARGET(kXPU), TARGET(kBM)});
......@@ -47,4 +47,4 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass,
paddle::lite::mir::ConvElementwiseFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)});
.ExcludeTargets({TARGET(kXPU), TARGET(kBM)});
......@@ -36,4 +36,6 @@ REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass,
paddle::lite::mir::ElementwiseAddActivationFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kBM)})
.ExcludeTargets({TARGET(kX86)})
.BindKernel("fusion_elementwise_add_activation");
......@@ -23,8 +23,13 @@ namespace lite {
namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::FcFuser fuser;
#ifdef LITE_WITH_X86
fusion::FcFuser fuser(true);
fuser(graph.get());
#endif
fusion::FcFuser fuser2(false);
fuser2(graph.get());
}
} // namespace mir
......@@ -34,4 +39,6 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
.ExcludeTargets({TARGET(kBM)})
.ExcludeTargets({TARGET(kCUDA)})
.BindKernel("fc");
......@@ -88,6 +88,7 @@ USE_LITE_OP(mul);
USE_LITE_OP(elementwise_add);
USE_LITE_OP(elementwise_sub);
USE_LITE_OP(fc);
USE_LITE_OP(relu);
USE_LITE_OP(feed);
USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
......
......@@ -35,12 +35,23 @@ void FcFuser::BuildPattern() {
std::vector<PMNode*> mul_inputs{W, x};
std::vector<PMNode*> add_inputs{mul_out, b};
mul_inputs >> *mul >> *mul_out;
add_inputs >> *add >> *Out;
// Some op specialities.
mul_out->AsIntermediate();
mul->AsIntermediate();
add->AsIntermediate();
if (with_relu_) {
auto* add_out = VarNode("add_out");
auto* relu = OpNode("relu", "relu");
std::vector<PMNode*> relu_inputs{add_out};
add_inputs >> *add >> *add_out;
relu_inputs >> *relu >> *Out;
add_out->AsIntermediate();
relu->AsIntermediate();
} else {
add_inputs >> *add >> *Out;
}
}
void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
......@@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) {
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
......
......@@ -25,11 +25,13 @@ namespace fusion {
// Fuses a mul op followed by an elementwise_add — and, when constructed
// with with_relu = true, also a trailing relu — into a single fc op.
class FcFuser : public FuseBase {
 public:
  // with_relu: when true the matched pattern additionally consumes a relu
  // op, and the generated fc op carries activation_type = "relu".
  explicit FcFuser(bool with_relu) : with_relu_(with_relu) {}
  void BuildPattern() override;
  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
 private:
  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
  // Whether the matched pattern includes a trailing relu activation.
  bool with_relu_;
};
} // namespace fusion
......
......@@ -256,4 +256,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM)})
.ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)});
.ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)});
......@@ -4,7 +4,7 @@ lite_cc_library(subgraph_detector
lite_cc_library(subgraph_pass
SRCS subgraph_pass.cc
DEPS mir_pass types context ${mir_fusers} subgraph_detector)
if (WITH_TESTING)
if (WITH_TESTING AND NOT LITE_WITH_CUDA)
lite_cc_test(test_subgraph_detector
SRCS subgraph_detector_test.cc
DEPS subgraph_detector mir_passes gflags model_parser cxx_api
......
......@@ -27,7 +27,7 @@ namespace mir {
void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
......@@ -41,7 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
......@@ -53,6 +53,20 @@ void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
// Groups all statements whose op type has a registered BM bridge into
// subgraph ops, so they can be offloaded to the Bitmain accelerator.
void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // Collect the op types with a BM bridge: the registration header expands
  // USE_SUBGRAPH_BRIDGE once per supported op.
  std::unordered_set<std::string> supported_ops;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_ops.insert(#op_type);
#include "lite/kernels/bm/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
  // A node is fusable iff it is a statement whose op type has a BM bridge.
  auto is_supported = [&](Node* n) {
    if (!n->IsStmt()) {
      return false;
    }
    return supported_ops.count(n->AsStmt().op_type()) > 0;
  };
  SubgraphFuser fuser(graph.get(), is_supported, 1 /* min_subgraph_size */);
  fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -61,3 +75,5 @@ REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
......@@ -32,6 +32,11 @@ class XPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
// Program pass that fuses BM-supported ops into subgraph ops for
// execution on the Bitmain accelerator (see BMSubgraphPass::Apply).
class BMSubgraphPass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -92,7 +92,7 @@ void FillInputTensors(
#define FILL_TENSOR_WITH_TYPE(type) \
auto input_tensor_data = input_tensor->mutable_data<type>(); \
for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[i] = static_cast<type>(value); \
input_tensor_data[j] = static_cast<type>(value); \
}
for (int i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i);
......
......@@ -100,6 +100,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kFPGA): {
CREATE_KERNEL(kFPGA);
} break;
case TARGET(kBM): {
CREATE_KERNEL(kBM);
} break;
default:
CHECK(false) << "not supported kernel target " << TargetToStr(target);
}
......@@ -186,6 +189,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kFPGA, kFloat, kNHWC);
INIT_FOR(kFPGA, kAny, kNHWC);
INIT_FOR(kFPGA, kAny, kAny);
INIT_FOR(kBM, kFloat, kNCHW);
INIT_FOR(kBM, kInt8, kNCHW);
INIT_FOR(kBM, kAny, kNCHW);
INIT_FOR(kBM, kAny, kAny);
#undef INIT_FOR
}
......
......@@ -230,6 +230,16 @@ class KernelRegistry final {
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kBM),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -137,8 +137,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
void RuntimeProgram::Run() {
for (auto& inst : instructions_) {
std::string op_type = inst.op()->op_info()->Type();
if (op_type == "feed" || op_type == "fetch") continue;
if (inst.is_feed_fetch_op()) continue;
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
......
......@@ -90,7 +90,12 @@ struct Program {
struct Instruction {
Instruction(const std::shared_ptr<OpLite>& op,
std::unique_ptr<KernelBase>&& kernel)
: op_(op), kernel_(std::move(kernel)) {}
: op_(op), kernel_(std::move(kernel)) {
std::string op_type = op->Type();
if (op_type == "feed" || op_type == "fetch") {
is_feed_fetch_op_ = true;
}
}
// Run the instruction.
void Run();
......@@ -101,6 +106,8 @@ struct Instruction {
const KernelBase* kernel() const { return kernel_.get(); }
KernelBase* mutable_kernel() { return kernel_.get(); }
bool is_feed_fetch_op() const { return is_feed_fetch_op_; }
#ifdef LITE_WITH_PROFILE
void set_profiler(profile::Profiler* profiler) {
profiler_ = profiler;
......@@ -118,6 +125,7 @@ struct Instruction {
private:
std::shared_ptr<OpLite> op_;
std::unique_ptr<KernelBase> kernel_;
bool is_feed_fetch_op_{false};
bool first_epoch_{true};
bool has_run_{false};
......
......@@ -198,6 +198,22 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
R *mutable_data() {
auto type_id = typeid(T).hash_code();
if (type_id == typeid(bool).hash_code()) { // NOLINT
precision_ = PrecisionType::kBool;
} else if (type_id == typeid(float).hash_code()) { // NOLINT
precision_ = PrecisionType::kFloat;
} else if (type_id == typeid(int8_t).hash_code()) {
precision_ = PrecisionType::kInt8;
} else if (type_id == typeid(int16_t).hash_code()) {
precision_ = PrecisionType::kInt16;
} else if (type_id == typeid(int32_t).hash_code()) {
precision_ = PrecisionType::kInt32;
} else if (type_id == typeid(int64_t).hash_code()) {
precision_ = PrecisionType::kInt64;
} else {
precision_ = PrecisionType::kUnk;
}
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target_, memory_size_);
return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
......@@ -222,10 +238,7 @@ class TensorLite {
template <typename T, typename R = T>
R *mutable_data(TargetType target) {
target_ = target;
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target, memory_size());
return reinterpret_cast<R *>(static_cast<char *>(buffer_->data()) +
offset_);
return mutable_data<T, R>();
}
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
......
# C++ Demo
1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像
2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。
3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz `
4. 执行以下命令准备模拟器环境
```shell
# armv8
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
sleep 1m
```
```shell
# armv7
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
sleep 1m
```
5. 准备模型、编译并运行完整api的demo
1. 环境准备
- 保证Android NDK在/opt目录下
- 一台armv7或armv8架构的安卓手机
    2. 编译并运行全量api的demo(注:当编译模式为tiny_publish时将不存在该demo)
```shell
cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make
adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb push mobilenet_v1 /data/local/tmp/
adb push mobilenetv1_full_api /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
6. 编译并运行轻量级api的demo
3. 编译并运行轻量级api的demo
```shell
cd ../mobile_light
make
adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
adb push mobilenetv1_light_api /data/local/tmp/
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
7. 编译并运行目标检测的demo
4. 编译并运行ssd目标检测的demo
```shell
cd ../mobile_detection
cd ../ssd_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz
tar zxvf mobilenetv1-ssd.tar.gz
make
adb -s emulator-5554 push mobile_detection /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./
adb push ssd_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push mobilenetv1-ssd /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/ssd_detection
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg"
adb pull /data/local/tmp/test_ssd_detection_result.jpg ./
```
运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg
运行成功将在ssd_detection目录下看到生成的目标检测结果图像: test_ssd_detection_result.jpg
8. 编译并运行物体分类的demo
5. 编译并运行yolov3目标检测的demo
```shell
cd ../yolov3_detection
wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz
tar zxvf mobilenetv1-yolov3.tar.gz
make
adb push yolov3_detection /data/local/tmp/
adb push test.jpg /data/local/tmp/
adb push mobilenetv1-yolov3 /data/local/tmp
adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb shell chmod +x /data/local/tmp/yolov3_detection
adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg"
adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./
```
运行成功将在yolov3_detection目录下看到生成的目标检测结果图像: test_yolov3_detection_result.jpg
6. 编译并运行物体分类的demo
```shell
cd ../mobile_classify
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb -s emulator-5554 push mobile_classify /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
运行成功将在控制台输出预测结果的前5个类别的预测概率
- 如若想看前10个类别的预测概率,在运行命令输入topk的值即可
eg:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10"
```
- 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可
eg:
```shell
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224"
```
9. 编译含CV预处理库模型单测demo
```shell
cd ../test_cv
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
./model_optimize_tool optimize model
make
adb -s emulator-5554 push test_model_cv /data/local/tmp/
adb -s emulator-5554 push test.jpg /data/local/tmp/
adb -s emulator-5554 push labels.txt /data/local/tmp/
adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv
adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH &&
/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt"
```
运行成功将在控制台输出预测结果的前10个类别的预测概率
......@@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_detection: fetch_opencv mobile_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS)
ssd_detection: fetch_opencv ssd_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS)
mobile_detection.o: mobile_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc
ssd_detection.o: ssd_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
......@@ -57,5 +57,5 @@ fetch_opencv:
.PHONY: clean
clean:
rm -f mobile_detection.o
rm -f mobile_detection
rm -f ssd_detection.o
rm -f ssd_detection
......@@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
mobile_detection: fetch_opencv mobile_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS)
ssd_detection: fetch_opencv ssd_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS)
mobile_detection.o: mobile_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc
ssd_detection.o: ssd_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
......@@ -57,5 +57,5 @@ fetch_opencv:
.PHONY: clean
clean:
rm -f mobile_detection.o
rm -f mobile_detection
rm -f ssd_detection.o
rm -f ssd_detection
ARM_ABI = arm7
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
ARM_ABI = arm8
LITE_WITH_CV = ON
export ARM_ABI
export LITE_WITH_CV
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
test_model_cv: fetch_opencv test_model_cv.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS)
test_model_cv.o: test_model_cv.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc
test_img_prepross: fetch_opencv test_img_prepross.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS)
test_img_prepross.o: test_img_prepross.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f test_model_cv.o
rm -f test_model_cv
rm -f test_img_prepross.o
rm -f test_img_prepross
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library: #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment above line using `libpaddle_light_api_shared.so`
# 2. Undo comment below line using `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
yolov3_detection: fetch_opencv yolov3_detection.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS)
yolov3_detection.o: yolov3_detection.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
.PHONY: clean
clean:
rm -f yolov3_detection.o
rm -f yolov3_detection
# Target ABI for the cross build; "arm8" selects the 64-bit toolchain
# (the library paths below use arm64-v8a). Exported for ../Makefile.def.
ARM_ABI = arm8
export ARM_ABI
# Shared compiler/linker settings common to all demo Makefiles.
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
# Prebuilt OpenCV package name; also the basename of the remote archive
# fetched by the fetch_opencv target.
OPENCV_VERSION=opencv4.1.0
# Root of the prebuilt OpenCV package for the arm64-v8a ABI. Factored into one
# variable because the original repeated this long path 14 times (DRY); with
# recursive (=) assignment the expanded values are byte-identical to before.
# Kept as a literal relative path so it matches what fetch_opencv unpacks.
OPENCV_BASE = ../../../third_party/${OPENCV_VERSION}/arm64-v8a
# Static OpenCV modules first, then their bundled third-party codec/runtime
# dependencies — ordering matters for static linking.
OPENCV_LIBS = ${OPENCV_BASE}/libs/libopencv_imgcodecs.a \
              ${OPENCV_BASE}/libs/libopencv_imgproc.a \
              ${OPENCV_BASE}/libs/libopencv_core.a \
              ${OPENCV_BASE}/3rdparty/libs/libtegra_hal.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibjpeg-turbo.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibwebp.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibpng.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibjasper.a \
              ${OPENCV_BASE}/3rdparty/libs/liblibtiff.a \
              ${OPENCV_BASE}/3rdparty/libs/libIlmImf.a \
              ${OPENCV_BASE}/3rdparty/libs/libtbb.a \
              ${OPENCV_BASE}/3rdparty/libs/libcpufeatures.a
# Header search path for the same OpenCV package.
OPENCV_INCLUDE = -I${OPENCV_BASE}/include
# Full compiler include set: Makefile.def includes + OpenCV + Paddle-Lite C++ API.
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include
# Link static OpenCV libs plus the Paddle-Lite light-API shared library.
# (A bundled-static alternative is shown, commented out, further below.)
CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS)
###############################################################
# How to use one of static library:                           #
# `libpaddle_api_full_bundled.a` #
# `libpaddle_api_light_bundled.a` #
###############################################################
# Note: default use lite's shared library. #
###############################################################
# 1. Comment out the CXX_LIBS line above that links `libpaddle_light_api_shared.so`
# 2. Uncomment the CXX_LIBS line below that links `libpaddle_api_light_bundled.a`
#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
# Link the demo binary; depends on fetch_opencv so the OpenCV libs exist first.
yolov3_detection: fetch_opencv yolov3_detection.o
	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS)
# Compile the demo source into an object file.
# NOTE(review): SYSROOT_COMPLILE looks misspelled ("COMPILE"), but the variable
# is defined in ../Makefile.def — verify there before renaming it anywhere.
yolov3_detection.o: yolov3_detection.cc
	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc
# Download and unpack the prebuilt OpenCV archive into THIRD_PARTY_DIR.
# Each step is idempotent: it is skipped when its result already exists.
# Fix: use `mkdir -p` so creation succeeds even when parent directories are
# missing, and does not fail if the directory appears between test and mkdir.
fetch_opencv:
	@ test -d ${THIRD_PARTY_DIR} || mkdir -p ${THIRD_PARTY_DIR}
	@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
	(echo "fetch opencv libs" && \
	wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
	@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
	tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
# Remove all build outputs produced by this Makefile.
.PHONY: clean
clean:
	rm -f yolov3_detection.o yolov3_detection
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册