Commit 3394388c authored by: J jiweibo

test=develop

...@@ -59,7 +59,10 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
...@@ -127,6 +130,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11
endif()
if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
...@@ -177,6 +184,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
...@@ -203,7 +214,9 @@ include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION
if(NOT APPLE)
include(flags)
endif()
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
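A minimal sketch (not part of this patch) of how the newly added switches might be turned on at configure time; the paths and values below are hypothetical examples, only the option and variable names come from the commit.

# hypothetical initial-cache script, e.g. passed via `cmake -C lite_backends.cmake ..`
set(LITE_WITH_RKNPU ON CACHE BOOL "build the RKNPU backend" FORCE)
set(RKNPU_DDK_ROOT "/opt/rknpu_ddk" CACHE PATH "root of the RKNPU DDK (hypothetical path)" FORCE)
set(LITE_WITH_MLU ON CACHE BOOL "build the MLU backend" FORCE)
set(LITE_WITH_XPU ON CACHE BOOL "build the XPU backend" FORCE)
set(LITE_WITH_XTCL ON CACHE BOOL "use the XTCL path of the XPU SDK" FORCE)
set(XPU_SDK_ROOT "/opt/xpu_sdk" CACHE PATH "root of the XPU SDK (hypothetical path)" FORCE)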
...@@ -70,7 +70,7 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32 OR APPLE)
# openmp not support well for now on windows
set(OPENMP_FLAGS "")
else(WIN32)
...@@ -134,8 +134,15 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
add_definitions("-DLITE_WITH_XTCL")
endif()
endif()
if (LITE_WITH_OPENCL)
...@@ -150,6 +157,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
...@@ -174,3 +185,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL")
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if (LITE_WITH_PYTHON)
add_definitions("-DLITE_WITH_PYTHON")
endif(LITE_WITH_PYTHON)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_RKNPU)
return()
endif()
if(NOT DEFINED RKNPU_DDK_ROOT)
set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
if(NOT RKNPU_DDK_ROOT)
message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(RKNPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(RKNPU_SUB_LIB_PATH "lib")
endif()
find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
if(NOT RKNPU_DDK_FILE)
message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
else()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
endif()
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
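A minimal sketch (not part of this patch) of how the imported rknpu_ddk target and the rknpu_runtime_libs cache variable defined above might be consumed by a downstream target; the target and source names here are hypothetical examples.

if(LITE_WITH_RKNPU)
  # rknpu_ddk is the shared library imported from ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}
  add_executable(demo_rknpu demo_rknpu.cc)
  target_link_libraries(demo_rknpu ${rknpu_runtime_libs})
endif()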
...@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include") include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
  NO_DEFAULT_PATH)
...@@ -82,23 +50,55 @@ else()
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")

if(LITE_WITH_XTCL)
  find_path(XPU_SDK_INC NAMES xtcl.h
    PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
  if(NOT XPU_SDK_INC)
    message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
  endif()
  include_directories("${XPU_SDK_ROOT}/XTCL/include")

  find_library(XPU_SDK_XTCL_FILE NAMES xtcl
    PATHS ${XPU_SDK_ROOT}/XTCL/so
    NO_DEFAULT_PATH)
  if(NOT XPU_SDK_XTCL_FILE)
    message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
  else()
    message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
    add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
    set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
  endif()

  find_library(XPU_SDK_TVM_FILE NAMES tvm
    PATHS ${XPU_SDK_ROOT}/XTCL/so
    NO_DEFAULT_PATH)
  if(NOT XPU_SDK_TVM_FILE)
    message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
  else()
    message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
    add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
    set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
  endif()

  find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
    PATHS ${XPU_SDK_ROOT}/XTDK/shlib
    NO_DEFAULT_PATH)
  if(NOT XPU_SDK_LLVM_FILE)
    message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
  else()
    message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
    add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
    set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
  endif()

  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
  set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
  set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
...@@ -36,7 +36,16 @@ else()
# eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG
######################################################################################################
# url address of eigen before v2.3.0
# URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
######################################################################################################
# url address of eigen since v2.6.0
# github address: https://github.com/eigenteam/eigen-git-mirror
# we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
...@@ -39,6 +33,15 @@ IF(WIN32)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE)
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
ELSE()
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
......
...@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
#find_python_module(numpy REQUIRED)
#find_python_module(wheel REQUIRED)
#find_python_module(google.protobuf REQUIRED)
#FIND_PACKAGE(NumPy REQUIRED)
#IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
# MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
# "please use pip to upgrade protobuf. pip install -U protobuf")
......
...@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} mklml)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
elseif(NOT APPLE)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif(WIN32)
endif()
......
...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
...@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
...@@ -100,6 +106,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_MLU)
foreach(var ${lite_deps_MLU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
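A minimal sketch (not part of this patch) of how lite_deps might be invoked with the newly accepted RKNPU_DEPS and MLU_DEPS arguments; the variable and dependency names below are hypothetical placeholders.

lite_deps(my_deps
          DEPS core_lib
          RKNPU_DEPS rknpu_bridge_lib   # collected into the result only when LITE_WITH_RKNPU=ON
          MLU_DEPS mlu_bridge_lib)      # collected into the result only when LITE_WITH_MLU=ON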
...@@ -125,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -136,6 +148,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
...@@ -144,6 +157,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
...@@ -170,7 +184,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -184,11 +198,13 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
...@@ -218,7 +234,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
...@@ -240,11 +256,13 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
...@@ -269,7 +287,9 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
...@@ -285,12 +305,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -313,6 +333,12 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
...@@ -369,6 +395,24 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -408,7 +452,9 @@ function(add_kernel TARGET device level) ...@@ -408,7 +452,9 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
...@@ -427,7 +473,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -461,7 +507,9 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
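A minimal sketch (not part of this patch) of how the extended helpers above might be used to register a kernel for the new RKNPU device and to declare backend-specific dependencies; the target and source names are hypothetical examples, only the function signatures and cache variables come from the commit.

# registers into rknpu_kernels when LITE_WITH_RKNPU=ON, otherwise the sources go to the fake kernel list
add_kernel(conv_compute_rknpu RKNPU basic SRCS conv_compute.cc)

# RKNPU/MLU deps are only linked when the matching backend is enabled
lite_cc_library(example_bridge SRCS example_bridge.cc
                RKNPU_DEPS ${rknpu_runtime_libs}
                MLU_DEPS ${mlu_kernels})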
...@@ -7,8 +7,11 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
...@@ -74,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
...@@ -81,16 +87,27 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib
if (LITE_WITH_PYTHON)
if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
else()
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
endif()
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
...@@ -108,8 +125,24 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
if (LITE_WITH_CUDA OR LITE_WITH_X86)
if(APPLE)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
else()
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
...@@ -117,50 +150,45 @@ if (LITE_WITH_X86) ...@@ -117,50 +150,45 @@ if (LITE_WITH_X86)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif()
if (LITE_WITH_X86)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
) )
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
if(LITE_WITH_CUDA)
add_custom_target(publish_inference_cuda_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_demos)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
......
...@@ -10,6 +10,7 @@ if (LITE_ON_TINY_PUBLISH)
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
...@@ -19,7 +20,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml)
endif()
endif()
...@@ -33,15 +34,19 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels})
if(NOT APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
endif()
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
...@@ -56,6 +61,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif()
endif()
...@@ -66,7 +76,9 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -78,6 +90,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}") message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}")
...@@ -86,8 +104,10 @@ message(STATUS "get ARM kernels ${arm_kernels}") ...@@ -86,8 +104,10 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
...@@ -102,6 +122,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
...@@ -123,9 +144,11 @@ lite_cc_library(light_api SRCS light_api.cc
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
...@@ -141,9 +164,11 @@ if(WITH_TESTING)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON" EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -185,7 +210,11 @@ if(WITH_TESTING) ...@@ -185,7 +210,11 @@ if(WITH_TESTING)
lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) ARGS --model_dir=${LITE_MODEL_DIR}/classify)
lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/yolov3)
endif()
endif()
endif()
...@@ -237,6 +266,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
...@@ -264,8 +294,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
...@@ -282,6 +310,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -291,7 +320,9 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -325,10 +356,12 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
...@@ -342,8 +375,10 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
...@@ -354,8 +389,10 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
...@@ -366,8 +403,10 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
...@@ -378,6 +417,8 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
...@@ -389,16 +430,20 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
...@@ -63,6 +63,7 @@ USE_LITE_OP(swish)
USE_LITE_OP(log)
USE_LITE_OP(exp)
USE_LITE_OP(conv2d_transpose)
USE_LITE_OP(depthwise_conv2d_transpose)
USE_LITE_OP(negative)
USE_LITE_OP(pad2d)
USE_LITE_OP(power)
......
...@@ -27,6 +27,9 @@
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(optimized_model_path,
"",
"the path of the model that is optimized by opt.");
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
...@@ -61,10 +64,7 @@ DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save the inference time to the file.");
DEFINE_bool(show_output, false, "Wether to show the output in shell.");
false,
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
namespace paddle {
namespace lite_api {
...@@ -100,15 +100,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<int64_t>& input_shape,
const std::string& model_path,
const std::string model_name) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_from_file(model_path);
auto predictor = lite_api::CreatePaddlePredictor(config);
...@@ -116,10 +124,7 @@ void Run(const std::vector<int64_t>& input_shape,
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(input_shape);
auto input_data = input_tensor->mutable_data<float>();
int64_t input_num = ShapeProduction(input_shape);
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
if (FLAGS_input_img_path.empty()) { if (FLAGS_input_img_path.empty()) {
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f; input_data[i] = 1.f;
...@@ -167,26 +172,78 @@ void Run(const std::vector<int64_t>& input_shape, ...@@ -167,26 +172,78 @@ void Run(const std::vector<int64_t>& input_shape,
ofs << "average = " << std::setw(12) << avg_res; ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl; ofs << std::endl;
ofs.close(); ofs.close();
if (FLAGS_show_output) {
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
LOG(INFO) << "output data[0:10]:";
for (int i = 0; i < output_num && i < 10; i++) {
LOG(INFO) << out_data[i];
}
}
} }
#endif #endif
} // namespace lite_api } // namespace lite_api
} // namespace paddle } // namespace paddle
void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (The path of the model that is optimized\n"
" by opt. If the model is optimized, please set the param.) \n"
" type: string \n"
" --model_dir (The path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (The filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (The filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (Set input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (The path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (Arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (Repeats times) type: int32 default: 1 \n"
" --result_filename (Save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (Threads num) type: int32 default: 1 \n"
" --warmup (Warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path. Otherwise, \n"
" set model_dir, model_filename and param_filename according to \n"
" the model. \n";
LOG(INFO) << help_info;
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
// Check inputs
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { bool is_opt_model = (FLAGS_optimized_model_path != "");
LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; bool is_origin_model = (FLAGS_model_dir != "");
if (!is_origin_model && !is_opt_model) {
LOG(INFO) << "Input error, the model path should not be empty.\n";
print_usage();
exit(0); exit(0);
} }
if (FLAGS_model_dir.back() == '/') { // Get input shape
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
std::string model_name = FLAGS_model_dir.substr(found + 1);
std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> { auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape; std::vector<int64_t> shape;
std::string tmp_str = str_shape; std::string tmp_str = str_shape;
...@@ -202,19 +259,31 @@ int main(int argc, char** argv) { ...@@ -202,19 +259,31 @@ int main(int argc, char** argv) {
} }
return shape; return shape;
}; };
std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape); std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
// Output optimized model if needed // Get model_name and run_model_path
if (FLAGS_run_model_optimize) { std::string model_name;
paddle::lite_api::OutputOptModel(save_optimized_model_dir); std::string run_model_path;
if (is_origin_model) {
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
model_name = FLAGS_model_dir.substr(found + 1);
std::string optimized_model_path = FLAGS_model_dir + "_opt2";
paddle::lite_api::OutputOptModel(optimized_model_path);
run_model_path = optimized_model_path + ".nb";
} else {
size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
size_t len = found2 - found1 - 1;
model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
run_model_path = FLAGS_optimized_model_path;
} }
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model // Run test
std::string run_model_dir = paddle::lite_api::Run(input_shape, run_model_path, model_name);
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shape, run_model_dir, model_name);
#endif #endif
return 0; return 0;
} }
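For readers following the refactor, here is a minimal sketch (not part of this commit) of the light-API path that Run() above now takes when given a pre-optimized model; the model file name is an assumed placeholder.
// Sketch only: assumes a model already converted by opt to "./mobilenet_v1_opt.nb".
#include "lite/api/paddle_api.h"
void RunSketch() {
  paddle::lite_api::MobileConfig config;
  config.set_threads(1);
  config.set_power_mode(static_cast<paddle::lite_api::PowerMode>(3));  // 3: no bind
  config.set_model_from_file("./mobilenet_v1_opt.nb");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int64_t i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.f;
  predictor->Run();
  auto output = predictor->GetOutput(0);
  float first = output->data<float>()[0];  // inspect output, as FLAGS_show_output does above
  (void)first;
}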
...@@ -296,9 +296,10 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc, ...@@ -296,9 +296,10 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
program_desc_ = desc; program_desc_ = desc;
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); for (auto &valid_place : valid_places) {
inner_places.emplace_back( inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
// Analyze whether the model is quantized. // Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places // For quantized model, add place(arm, int8) to inner_places
......
...@@ -46,6 +46,7 @@ class LITE_API Predictor { ...@@ -46,6 +46,7 @@ class LITE_API Predictor {
scope_ = std::make_shared<Scope>(); scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>(); program_desc_ = std::make_shared<cpp::ProgramDesc>();
} }
// Create a predictor with the weight variable scope set. // Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope) explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {} : scope_(root_scope) {}
......
...@@ -19,41 +19,54 @@ ...@@ -19,41 +19,54 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/version.h" #include "lite/core/version.h"
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h> #include <omp.h>
#include "lite/backends/x86/mklml.h" #include "lite/backends/x86/mklml.h"
#endif #endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config; config_ = config;
auto places = config.valid_places();
std::vector<std::string> passes{};
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
Env<TARGET(kCUDA)>::Init(); // if kCUDA is included in valid places, it should be initialized first,
#endif // otherwise skip this step.
if (!status_is_cloned_) { for (auto &p : places) {
auto places = config.valid_places(); if (p.target == TARGET(kCUDA)) {
std::vector<std::string> passes{}; Env<TARGET(kCUDA)>::Init();
auto use_layout_preprocess_pass = if (config_.multi_stream()) {
config.model_dir().find("OPENCL_PRE_PRECESS"); passes = {"multi_stream_analysis_pass"};
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; VLOG(3) << "add pass: " << passes[0];
if (places[0].target == TARGET(kOpenCL) && }
use_layout_preprocess_pass != std::string::npos) { break;
passes = {"type_layout_cast_preprocess_pass"};
VLOG(1) << "add pass:" << passes[0];
} }
raw_predictor_->Build(config, places, passes); }
} else { #endif
CHECK(raw_predictor_) << "The Predictor can not be nullptr in Clone mode."; #ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
if (places[0].target == TARGET(kOpenCL) &&
use_layout_preprocess_pass != std::string::npos) {
passes = {"type_layout_cast_preprocess_pass"};
VLOG(1) << "add pass:" << passes[0];
} }
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
// setting the x86 math library threads from the config is disabled here because it has proved unstable
// int num_threads = config.x86_math_library_num_threads(); // int num_threads = config.x86_math_library_num_threads();
// int real_num_threads = num_threads > 1 ? num_threads : 1; // int real_num_threads = num_threads > 1 ? num_threads : 1;
int real_num_threads = 1; int real_num_threads = 1;
......
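A small configuration sketch (an assumption, not code from this commit) showing how a caller would reach the new CUDA multi-stream branch in CxxPaddleApiImpl::Init() above; the model directory is a placeholder.
#include "lite/api/paddle_api.h"
void BuildCudaPredictorSketch() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // assumed non-combined model directory
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
#ifdef LITE_WITH_CUDA
  // Init() sees kCUDA in valid_places, calls Env<TARGET(kCUDA)>::Init(), and with
  // multi_stream enabled also schedules the multi_stream_analysis_pass.
  config.set_multi_stream(true);
#endif
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
}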
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "lite/api/light_api.h" #include "lite/api/light_api.h"
#include <algorithm> #include <algorithm>
#include <unordered_map>
#include "paddle_use_kernels.h" // NOLINT #include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT #include "paddle_use_ops.h" // NOLINT
...@@ -28,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, ...@@ -28,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
} }
// For post-training weight quantization, load the int8/16 weights of the
// optimized model and dequantize them to fp32.
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_); BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
} }
...@@ -135,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { ...@@ -135,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
// 1. Create op first // 1. Create op first
Program program(prog, scope_, {}); Program program(prog, scope_, {});
// 2. Create Instructs // 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific // Create the kernels of the target places, and filter out the specific
// kernel with the target alias. // kernel with the target alias.
...@@ -151,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { ...@@ -151,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
return it->alias() == alias; return it->alias() == alias;
}); });
CHECK(it != kernels.end()); CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it)); insts.emplace_back(op, std::move(*it));
} }
...@@ -162,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { ...@@ -162,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
} }
void LightPredictor::DequantizeWeight() { void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \ #define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \ for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < w; ++j) { \ for (int64_t j = 0; j < offset; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \
} \ } \
} }
#define PROCESS_FC_DATA() \ #define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \ for (int64_t i = 0; i < chin; i++) { \
*fp_data = scale_list[0] * (*int_data); \ for (int64_t j = 0; j < chout; j++) { \
++fp_data; \ fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \
++int_data; \ } \
} }
auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
bool result = false;
if (op_desc->HasAttr("quantization_type")) {
std::string type = op_desc->GetAttr<std::string>("quantization_type");
result = (type == "post_weight_abs_max") ||
(type == "post_weight_channel_wise_abs_max");
} else {
result = op_desc->HasAttr("quantize_weight_bits");
}
return result;
};
Tensor tmp_tensor; Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize()); for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < main_block->OpsSize(); ++k) { for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k); auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op if (is_weight_quantized_op(op_desc)) {
auto input_names = op_desc->input_vars(); auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) { for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale"; std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor = auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>(); scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor); tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list = auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name); op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits"); int quantize_weight_bits =
float* fp_data = input_tensor->mutable_data<float>(); op_desc->GetAttr<int>("quantize_weight_bits");
CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
std::string op_type = op_desc->Type(); float* fp_data = input_tensor->mutable_data<float>();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0]; std::string op_type = op_desc->Type();
int64_t w = input_tensor->numel() / h; if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
CHECK_EQ(scale_list.size(), h); int64_t ch = input_tensor->dims()[0];
if (quantize_weight_bits == 8) { int64_t offset = input_tensor->numel() / ch;
const int8_t* int_data = tmp_tensor.data<int8_t>(); CHECK_EQ(scale_list.size(), ch);
PROCESS_CONV2D_DATA() if (quantize_weight_bits == 8) {
} else { const int8_t* int_data = tmp_tensor.data<int8_t>();
const int16_t* int_data = tmp_tensor.data<int16_t>(); PROCESS_CONV2D_DATA()
PROCESS_CONV2D_DATA() } else {
} const int16_t* int_data = tmp_tensor.data<int16_t>();
} else if (op_type == "fc" || op_type == "mul") { PROCESS_CONV2D_DATA()
if (quantize_weight_bits == 8) { }
const int8_t* int_data = tmp_tensor.data<int8_t>(); } else if (op_type == "fc" || op_type == "mul") {
PROCESS_FC_DATA() int64_t chin = input_tensor->dims()[0];
} else { int64_t chout = input_tensor->dims()[1];
const int16_t* int_data = tmp_tensor.data<int16_t>(); CHECK_EQ(scale_list.size(), chout);
PROCESS_FC_DATA() if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
} }
} }
} }
......
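To make the new per-channel logic above concrete, here is a small worked sketch (made-up numbers) of what PROCESS_CONV2D_DATA computes for a conv2d weight quantized to int8 with one scale per output channel.
// Worked sketch only; values are illustrative, not from any real model.
void DequantSketch() {
  const int64_t ch = 2, offset = 2;              // weight numel = 4
  const int8_t int_data[4] = {10, -20, 30, -40};
  const float scale_list[2] = {0.01f, 0.02f};    // one scale per output channel
  float fp_data[4];
  for (int64_t i = 0; i < ch; ++i) {
    for (int64_t j = 0; j < offset; ++j) {
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
  // fp_data is now {0.1f, -0.2f, 0.6f, -0.8f}; the fc path works the same way but
  // indexes the scale by output-channel column (scale_list[j]).
}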
...@@ -109,6 +109,12 @@ std::vector<Place> ParserValidPlaces() { ...@@ -109,6 +109,12 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -185,6 +191,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -185,6 +191,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -249,16 +256,16 @@ void PrintHelpInfo() { ...@@ -249,16 +256,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
...@@ -217,6 +218,58 @@ void ConfigBase::set_threads(int threads) { ...@@ -217,6 +218,58 @@ void ConfigBase::set_threads(int threads) {
#endif #endif
} }
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
void CxxConfig::set_mlu_core_number(int core_number) {
mlu_core_number_ = core_number;
}
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
return mlu_core_version_;
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
"rebuild it with LITE_WITH_XPU=ON.";
#endif
}
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
#endif
}
// set model data in combined format, `set_model_from_file` refers to loading // set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory // model from file, set_model_from_buffer refers to loading model from memory
// buffer // buffer
......
...@@ -141,6 +141,17 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -141,6 +141,17 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86 #ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1; int x86_math_library_math_threads_ = 1;
#endif #endif
#ifdef LITE_WITH_CUDA
bool multi_stream_{false};
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public: public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; } void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
...@@ -168,6 +179,41 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -168,6 +179,41 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_; return x86_math_library_math_threads_;
} }
#endif #endif
#ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
bool multi_stream() const { return multi_stream_; }
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first conv kernel. First conv is a special kernel
// provided by MLU; its input is uint8, and it also needs two 3-dimensional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimensional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimensional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
void set_xpu_dev_per_thread(int dev_no = 0);
}; };
/// MobileConfig is the config for the light weight predictor, it will skip /// MobileConfig is the config for the light weight predictor, it will skip
......
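The comments above describe the new MLU and XPU knobs; the following sketch (an assumption, with placeholder values, not code from this commit) shows how they might be wired on a CxxConfig.
#include "lite/api/paddle_api.h"
void ConfigureDeviceSketch() {
  paddle::lite_api::CxxConfig config;
#ifdef LITE_WITH_MLU
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);                          // assumed core count
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));         // feed NHWC data
  config.set_mlu_use_first_conv(true);                    // uint8 first conv
  config.set_mlu_first_conv_mean({124.f, 117.f, 104.f});  // assumed channel means
  config.set_mlu_first_conv_std({59.f, 57.f, 57.f});      // assumed channel stds
#endif
  // The XPU helpers are declared unconditionally and only warn at runtime when the
  // library was not built with LITE_WITH_XPU.
  config.set_xpu_workspace_l3_size_per_thread();  // defaults to 0xfffc00 bytes
  config.set_xpu_dev_per_thread(0);
}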
...@@ -71,7 +71,9 @@ const std::string& TargetToStr(TargetType target) { ...@@ -71,7 +71,9 @@ const std::string& TargetToStr(TargetType target) {
"fpga", "fpga",
"npu", "npu",
"xpu", "xpu",
"bm"}; "bm",
"mlu",
"rknpu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -111,7 +113,9 @@ const std::string& TargetRepr(TargetType target) { ...@@ -111,7 +113,9 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kBM"}; "kMLU",
"kBM",
"kRKNPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -153,6 +157,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -153,6 +157,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU), TARGET(kNPU),
TARGET(kXPU), TARGET(kXPU),
TARGET(kBM), TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)}); TARGET(kFPGA)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
......
...@@ -53,9 +53,10 @@ enum class TargetType : int { ...@@ -53,9 +53,10 @@ enum class TargetType : int {
kNPU = 8, kNPU = 8,
kXPU = 9, kXPU = 9,
kBM = 10, kBM = 10,
kAny = 6, // any target
kMLU = 11, kMLU = 11,
NUM = 12, // number of fields. kRKNPU = 12,
kAny = 6, // any target
NUM = 13, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
...@@ -89,6 +90,8 @@ typedef enum { ...@@ -89,6 +90,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5 LITE_POWER_RAND_LOW = 5
} PowerMode; } PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int { enum class ActivationType : int {
kIndentity = 0, kIndentity = 0,
kRelu = 1, kRelu = 1,
...@@ -100,7 +103,9 @@ enum class ActivationType : int { ...@@ -100,7 +103,9 @@ enum class ActivationType : int {
kSwish = 7, kSwish = 7,
kExp = 8, kExp = 8,
kAbs = 9, kAbs = 9,
NUM = 10, kHardSwish = 10,
kReciprocal = 11,
NUM = 12,
}; };
static size_t PrecisionTypeLength(PrecisionType type) { static size_t PrecisionTypeLength(PrecisionType type) {
......
...@@ -42,8 +42,13 @@ USE_MIR_PASS(type_precision_cast_pass); ...@@ -42,8 +42,13 @@ USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
...@@ -17,8 +17,12 @@ execute_process( ...@@ -17,8 +17,12 @@ execute_process(
OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
) )
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in if(APPLE)
${CMAKE_CURRENT_BINARY_DIR}/setup.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
add_subdirectory(pybind) add_subdirectory(pybind)
#add_subdirectory(interface) #add_subdirectory(interface)
...@@ -47,6 +47,7 @@ using lite_api::TargetType; ...@@ -47,6 +47,7 @@ using lite_api::TargetType;
using lite_api::PrecisionType; using lite_api::PrecisionType;
using lite_api::DataLayoutType; using lite_api::DataLayoutType;
using lite_api::Place; using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl; using lite::LightPredictorImpl;
using lite_api::OptBase; using lite_api::OptBase;
...@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m); ...@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m); static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m); static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m); static void BindLiteTensor(py::module *m);
static void BindLiteMLUCoreVersion(py::module *m);
void BindLiteApi(py::module *m) { void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m); BindLiteCxxConfig(m);
...@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) { ...@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
BindLitePowerMode(m); BindLitePowerMode(m);
BindLitePlace(m); BindLitePlace(m);
BindLiteTensor(m); BindLiteTensor(m);
BindLiteMLUCoreVersion(m);
#ifndef LITE_ON_TINY_PUBLISH #ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m); BindLiteCxxPredictor(m);
#endif #endif
...@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) { ...@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode) .def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode); .def("power_mode", &CxxConfig::power_mode);
#endif #endif
#ifdef LITE_WITH_MLU
cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
.def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
.def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
.def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
.def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
.def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
#endif
} }
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ?? // TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
...@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) { ...@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW); .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
} }
void BindLiteMLUCoreVersion(py::module *m) {
py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
.value("LITE_MLU_220", MLUCoreVersion::MLU_220)
.value("LITE_MLU_270", MLUCoreVersion::MLU_270);
}
void BindLitePlace(py::module *m) { void BindLitePlace(py::module *m) {
// TargetType // TargetType
py::enum_<TargetType>(*m, "TargetType") py::enum_<TargetType>(*m, "TargetType")
...@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) { ...@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL) .value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA) .value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU) .value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
...@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) { ...@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data) DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8); DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32); DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float); DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE #undef DO_GETTER_ONCE
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module of pack whl installer for Paddle-lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
    'binary distribution'
    def has_ext_modules(foo):
        return True

# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMIT = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
    PADDLELITE_VERSION = PADDLELITE_COMMIT
else:
    PADDLELITE_VERSION = PADDLELITE_TAG

# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}

# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
    shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
    PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib']

# link lite.so to paddlelite.libs
COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
    raise Exception("patch third_party libs failed, command: %s" % COMMAND)

# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
    os.remove(LIB_PATH+'/__init__.py')

# set dir path of each package
PACKAGE_DIR = {
    # The paddle.fluid.proto will be generated while compiling.
    # So that package points to other directory.
    'paddlelite.libs': LIB_PATH,
    'paddlelite': LITE_PATH
}

setup(
    name='paddlelite',
    version=PADDLELITE_VERSION,
    description='Paddle-Lite Library',
    packages=['paddlelite', 'paddlelite.libs'],
    package_dir=PACKAGE_DIR,
    package_data=PACKAGE_DATA,
    distclass=BinaryDistribution
)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_img_txt_path,
"",
"if set input_img_txt_path, read the img filename as input.");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
lite::Predictor predictor;
std::vector<std::string> passes;
predictor.Build(FLAGS_model_dir,
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places,
passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
}
}
auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>();
data[0] = FLAGS_im_height;
data[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto out = predictor.GetOutputs();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
}
fclose(fp);
}
TEST(Yolov3, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
TestModel(valid_places);
}
} // namespace lite
} // namespace paddle
...@@ -6,4 +6,6 @@ add_subdirectory(fpga) ...@@ -6,4 +6,6 @@ add_subdirectory(fpga)
add_subdirectory(host) add_subdirectory(host)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(rknpu)
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "lite/backends/arm/math/activation.h" #include "lite/backends/arm/math/activation.h"
#include <algorithm>
#include <string> #include <string>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -711,6 +712,47 @@ void act_square<float>(const float* din, float* dout, int size, int threads) { ...@@ -711,6 +712,47 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
} }
} }
template <>
void act_hard_swish<float>(const float* din,
float* dout,
int size,
float threshold,
float scale,
float offset,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
ptr_in[0] / scale;
ptr_in++;
ptr_out++;
}
}
template <>
void act_reciprocal<float>(const float* din,
float* dout,
int size,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = 1.0 / ptr_in[0];
ptr_in++;
ptr_out++;
}
}
template <>
void act_abs<float>(const float* din, float* dout, int size, int threads) {
for (int i = 0; i < size; ++i) {
dout[0] = (din[0] > 0 ? din[0] : -din[0]);
din++;
dout++;
}
}
#ifdef LITE_WITH_TRAIN #ifdef LITE_WITH_TRAIN
template <> template <>
void act_square_grad(const float* din, void act_square_grad(const float* din,
......
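A quick numeric check of the new activations (the hard-swish parameters threshold = 6, scale = 6, offset = 3 are the common defaults and are assumed here; the kernel itself simply uses whatever the op passes in):
// hard_swish(x) = min(max(0, x + offset), threshold) * x / scale
//   x =  1.0 -> min(max(0,  4.0), 6.0) *  1.0 / 6.0 = 0.6667
//   x = -4.0 -> min(max(0, -1.0), 6.0) * -4.0 / 6.0 = 0.0
// reciprocal(x) = 1 / x,           e.g. reciprocal(4.0) = 0.25
// abs(x)        = x > 0 ? x : -x,  e.g. abs(-2.5)       = 2.5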
...@@ -72,6 +72,20 @@ void act_rsqrt(const T* din, T* dout, int size, int threads); ...@@ -72,6 +72,20 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T> template <typename T>
void act_square(const T* din, T* dout, int size, int threads); void act_square(const T* din, T* dout, int size, int threads);
template <typename T>
void act_hard_swish(const T* din,
T* dout,
int size,
float threshold,
float scale,
float offset,
int threads);
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T>
void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN #ifdef LITE_WITH_TRAIN
template <typename T> template <typename T>
void act_square_grad( void act_square_grad(
......
...@@ -16,46 +16,3 @@ ...@@ -16,46 +16,3 @@
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
const int axis,
lite::Tensor *output) {
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
auto dim_0 = input[0]->dims();
size_t num = input.size();
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
float *dst_ptr = output->mutable_data<float>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
const float *src_ptr = input[n]->data<float>();
int64_t in_concat_axis = dims[axis];
float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
...@@ -25,9 +25,39 @@ namespace lite { ...@@ -25,9 +25,39 @@ namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
void concat_func(const std::vector<lite::Tensor *> &input, template <typename T>
void concat_func(const std::vector<lite::Tensor*>& input,
const int axis, const int axis,
lite::Tensor *output); lite::Tensor* output) {
size_t num = input.size();
auto dim_0 = input[0]->dims();
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
auto* dst_ptr = output->mutable_data<T>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
auto* src_ptr = input[n]->data<T>();
int64_t in_concat_axis = dims[axis];
auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
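A short walk-through of the header-only template above, for a concrete shape (assumed, not from the commit): concatenating two float tensors of shape [2, 3] along axis = 1 into a [2, 6] output.
// concat_input_size = 1 (no dims after axis), num_cancats = 2 (dims before axis)
// out_sum = out_dims[1] * 1 = 6, in_sum = 3 for each input
// input0: rows memcpy'd to output offsets 0 and 6 (dout_ptr advances by out_sum)
// input1: offset_concat_axis = 3, so its rows land at output offsets 3 and 9
// i.e. each output row is [input0 row | input1 row], which is concat along axis 1.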
...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src, ...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src,
reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
} }
template <>
void mean_grad<float>(const float* out_grad, float* in_grad, int size) {
float grad = out_grad[0] / size;
float32x4_t grad_v = vdupq_n_f32(grad);
int loop = size >> 2;
int remain = size & 3;
#pragma omp parallel for
for (int i = 0; i < loop; ++i) {
vst1q_f32(in_grad + (i << 2), grad_v);
}
in_grad += (loop << 2);
for (int i = 0; i < remain; ++i) {
in_grad[i] = grad;
}
}
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
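Numerically, the new mean_grad above spreads the upstream gradient evenly:
// mean over `size` elements back-propagates out_grad / size to every input element,
// e.g. out_grad = {0.8}, size = 4  ->  in_grad = {0.2, 0.2, 0.2, 0.2}.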
...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, ...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src,
int height_in, int height_in,
int width_in); int width_in);
template <typename T>
void mean_grad(const T* out_grad, T* in_grad, int size);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) ...@@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps})
lite_cc_library(cuda_context SRCS context.cc DEPS device_info)
add_subdirectory(math) add_subdirectory(math)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/context.h"
namespace paddle {
namespace lite {} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/cuda/blas.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
template <TargetType Type>
class Context;
using CUDAContext = Context<TargetType::kCUDA>;
// Only works with CUDA kernels.
template <>
class Context<TargetType::kCUDA> {
public:
typename Env<TargetType::kCUDA>::Devs& devs =
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {
if (devs.size() > 0) {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
} else {
LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
}
}
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
if (io_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "data stream index exceeds the maximum stream number, "
"set to default stream(0)!";
io_stream_id = 0;
}
if (exec_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "exec stream index exceeds the maximum stream number, "
"set to default stream(0)!";
exec_stream_id = 0;
}
exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id];
io_stream_ = devs[dev_id].io_streams()[io_stream_id];
exec_stream_id_ = exec_stream_id;
io_stream_id_ = io_stream_id;
need_sync_ = false;
}
void CopySharedTo(CUDAContext* ctx) {
CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_;
}
const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
const cudaStream_t& io_stream() const { return io_stream_; }
void SetIoStream(cudaStream_t stream) { io_stream_ = stream; }
std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; }
void SetCuBlasFP32(std::shared_ptr<cuda::Blas<float>> cublas_fp32) {
cublas_fp32_ = cublas_fp32;
}
const std::vector<cudaEvent_t>& input_events() { return input_events_; }
void SetInputEvents(const std::vector<cudaEvent_t>& input_events) {
input_events_.clear();
input_events_.assign(input_events.begin(), input_events.end());
}
const std::vector<cudaEvent_t>& output_events() { return output_events_; }
void SetOutputEvents(const std::vector<cudaEvent_t>& output_events) {
output_events_.clear();
output_events_.assign(output_events.begin(), output_events.end());
}
std::vector<cudaStream_t> all_exec_streams() {
int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
return devs[dev_id].exec_streams();
}
void SetSyncStreams(const std::vector<int>& nums) {
sync_streams_.clear();
std::vector<cudaStream_t> exec_streams = all_exec_streams();
for (size_t i = 0; i < nums.size(); ++i) {
CHECK(nums[i] >= 0 && nums[i] < static_cast<int>(exec_streams.size()))
<< "streams id is not valid";
sync_streams_.push_back(exec_streams[nums[i]]);
}
InitSyncEvents(nums.size());
}
void InitSyncEvents(const int num) {
sync_events_.clear();
for (int i = 0; i < num; ++i) {
cudaEvent_t eve;
TargetWrapperCuda::CreateEventWithFlags(&eve);
sync_events_.push_back(eve);
}
}
void SetNeedSync(bool sync) { need_sync_ = sync; }
bool need_sync() const { return need_sync_; }
void Sync() {
CHECK_EQ(sync_streams_.size(), sync_events_.size());
for (size_t i = 0; i < sync_events_.size(); ++i) {
TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]);
TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]);
}
}
std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private:
int device_id_;
// overall information
int exec_stream_id_;
int io_stream_id_;
cudaStream_t exec_stream_;
cudaStream_t io_stream_;
// not thread-safe, should allocate for each thread.
std::shared_ptr<cuda::Blas<float>> cublas_fp32_;
// kernel information
std::vector<cudaEvent_t> input_events_;
std::vector<cudaEvent_t> output_events_;
// multi stream sync.
std::vector<cudaStream_t> sync_streams_;
std::vector<cudaEvent_t> sync_events_;
bool need_sync_;
};
} // namespace lite
} // namespace paddle
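A usage sketch (an assumption, not part of this commit) of the multi-stream helpers defined above; in practice ContextScheduler owns InitOnce()/Init() and the multi_stream_analysis_pass decides which streams a kernel must wait on.
void CudaContextSketch() {
  // Env<TARGET(kCUDA)>::Init() must already have run (see CxxPaddleApiImpl::Init).
  paddle::lite::CUDAContext ctx;
  ctx.InitOnce();
  ctx.Init(/*dev_id=*/0, /*exec_stream_id=*/1);
  ctx.SetSyncStreams({0, 2});  // this kernel depends on work queued on streams 0 and 2
  ctx.SetNeedSync(true);
  // ... launch kernels on ctx.exec_stream() ...
  if (ctx.need_sync()) {
    ctx.Sync();  // record events on streams 0/2 and make exec stream 1 wait on them
  }
}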
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,7 +10,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include <algorithm>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -36,10 +32,8 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, ...@@ -36,10 +32,8 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
STL::stringstream program_key_ss; STL::stringstream program_key_ss;
program_key_ss << file_name << options; program_key_ss << file_name << options;
std::string program_key = program_key_ss.str(); std::string program_key = program_key_ss.str();
auto it = programs_.find(program_key);
auto &programs = CLRuntime::Global()->programs(); if (it != programs_.end()) {
auto it = programs.find(program_key);
if (it != programs.end()) {
VLOG(3) << " --- program -> " << program_key << " has been built --- "; VLOG(3) << " --- program -> " << program_key << " has been built --- ";
return *(it->second); return *(it->second);
} }
...@@ -50,9 +44,9 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, ...@@ -50,9 +44,9 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
CLRuntime::Global()->BuildProgram(program.get(), options); CLRuntime::Global()->BuildProgram(program.get(), options);
VLOG(3) << " --- end build program -> " << program_key << " --- "; VLOG(3) << " --- end build program -> " << program_key << " --- ";
programs[program_key] = std::move(program); programs_[program_key] = std::move(program);
return *(programs[program_key]); return *(programs_[program_key]);
} }
void CLContext::AddKernel(const std::string &kernel_name, void CLContext::AddKernel(const std::string &kernel_name,
...@@ -64,34 +58,29 @@ void CLContext::AddKernel(const std::string &kernel_name, ...@@ -64,34 +58,29 @@ void CLContext::AddKernel(const std::string &kernel_name,
auto program = GetProgram(file_name, options); auto program = GetProgram(file_name, options);
VLOG(3) << " --- end get program --- "; VLOG(3) << " --- end get program --- ";
VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
std::unique_ptr<cl::Kernel> kernel( std::shared_ptr<cl::Kernel> kernel(
new cl::Kernel(program, kernel_name.c_str(), &status)); new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(3) << " --- end create kernel --- "; VLOG(3) << " --- end create kernel --- ";
kernels_.emplace_back(std::move(kernel));
auto &kernels = CLRuntime::Global()->kernels();
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
kernels.emplace_back(std::move(kernel));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_name << options << time_stamp; kernel_key << kernel_name << options << time_stamp;
kernel_offset_map[kernel_key.str()] = kernels.size() - 1; kernel_offset_[kernel_key.str()] = kernels_.size() - 1;
} }
cl::Kernel &CLContext::GetKernel(const int index) { cl::Kernel &CLContext::GetKernel(const int index) {
auto &kernels = CLRuntime::Global()->kernels(); VLOG(3) << " --- kernel count: " << kernels_.size() << " --- ";
VLOG(3) << " --- kernel count: " << kernels.size() << " --- "; CHECK(static_cast<size_t>(index) < kernels_.size())
CHECK(static_cast<size_t>(index) < kernels.size())
<< "The index must be less than the size of kernels."; << "The index must be less than the size of kernels.";
CHECK(kernels[index] != nullptr) CHECK(kernels_[index] != nullptr)
<< "The target kernel pointer cannot be null."; << "The target kernel pointer cannot be null.";
return *(kernels[index]); return *(kernels_[index]);
} }
cl::Kernel &CLContext::GetKernel(const std::string &name) { cl::Kernel &CLContext::GetKernel(const std::string &name) {
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset(); auto it = kernel_offset_.find(name);
auto it = kernel_offset_map.find(name); CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: "
CHECK(it != kernel_offset_map.end()) << "Cannot find the kernel function: " << name;
<< name;
return GetKernel(it->second); return GetKernel(it->second);
} }
......
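The hunks above move the program and kernel caches from the global CLRuntime into CLContext: programs are cached under a "file_name + options" key, kernels are appended to a vector, and a "kernel_name + options + time_stamp" key records each kernel's index. A minimal sketch of that lookup scheme, using a stand-in type instead of cl::Kernel (names here are illustrative, not the actual Paddle-Lite code):

#include <map>
#include <memory>
#include <string>
#include <vector>

struct FakeKernel { std::string name; };  // stand-in for cl::Kernel

class KernelCache {
 public:
  // Mirrors CLContext::AddKernel: store the kernel, remember its index.
  void Add(const std::string& kernel_name, const std::string& options,
           const std::string& time_stamp) {
    kernels_.emplace_back(new FakeKernel{kernel_name});
    kernel_offset_[kernel_name + options + time_stamp] =
        static_cast<int>(kernels_.size()) - 1;
  }
  // Mirrors CLContext::GetKernel(name): composite key -> index -> kernel.
  FakeKernel& Get(const std::string& key) {
    return *kernels_.at(kernel_offset_.at(key));
  }

 private:
  std::vector<std::shared_ptr<FakeKernel>> kernels_;
  std::map<std::string, int> kernel_offset_;
};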
...@@ -27,6 +27,21 @@ namespace lite { ...@@ -27,6 +27,21 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() {
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto &p : programs_) {
// Note(ysh329): Don't need `clReleaseProgram`
p.second.reset();
}
programs_.clear();
LOG(INFO) << "release cl::Program, cl::Kernel finished.";
}
cl::CommandQueue &GetCommandQueue(); cl::CommandQueue &GetCommandQueue();
cl::Context &GetContext(); cl::Context &GetContext();
...@@ -52,6 +67,11 @@ class CLContext { ...@@ -52,6 +67,11 @@ class CLContext {
int divitor = 2); int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size); // size_t max_work_size);
private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::shared_ptr<cl::Kernel>> kernels_;
std::map<std::string, int> kernel_offset_;
}; };
} // namespace lite } // namespace lite
......
...@@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input,
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width const sampler_t sampler =
const int y = get_global_id(1); // image_height CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = 1 / (1 + exp(-in)); CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
......
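The sigmoid kernel is rewritten to compute 1 / (1 + e^-x) per component, with e written out as the literal 2.71828182, instead of the previous vectorized `1 / (1 + exp(-in))`. A host-side reference for sanity-checking the kernel output could look like the following sketch (not part of the commit):

#include <cmath>

// Reference sigmoid for one float4-like element, matching the per-component
// formula used in the kernel above (e approximated as 2.71828182).
struct Float4 { float x, y, z, w; };

Float4 SigmoidRef(const Float4& in) {
  auto s = [](float v) { return 1.0f / (1.0f + std::pow(2.71828182f, -v)); };
  return {s(in.x), s(in.y), s(in.z), s(in.w)};
}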
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void decode_center_size(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh % out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
CL_DTYPE4 prior_box_input[4];
CL_DTYPE4 prior_box_var_input[4];
CL_DTYPE4 target_box_input[4];
prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 0, prior_box_pos.y));
prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 1, prior_box_pos.y));
prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 2, prior_box_pos.y));
prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 3, prior_box_pos.y));
prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 1, target_box_pos.y));
target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 2, target_box_pos.y));
target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 3, target_box_pos.y));
CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
CL_DTYPE4 target_box_center_x;
CL_DTYPE4 target_box_center_y;
CL_DTYPE4 target_box_width;
CL_DTYPE4 target_box_height;
CL_DTYPE4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
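The kernel above decodes boxes in "center size" mode: the prior box gives a width, height and center, the variances scale the target offsets, exp() recovers the decoded width/height, and the result is converted back to corner coordinates. A scalar C++ reference of the same math (a sketch for verification only, not the OpenCL code path):

#include <cmath>

// Decode one box. prior = {xmin, ymin, xmax, ymax}, var = 4 variances,
// target = 4 encoded offsets, out = decoded {xmin, ymin, xmax, ymax}.
void DecodeCenterSizeRef(const float prior[4], const float var[4],
                         const float target[4], float out[4]) {
  const float prior_w = prior[2] - prior[0];
  const float prior_h = prior[3] - prior[1];
  const float prior_cx = (prior[2] + prior[0]) / 2.f;
  const float prior_cy = (prior[3] + prior[1]) / 2.f;

  const float cx = var[0] * target[0] * prior_w + prior_cx;
  const float cy = var[1] * target[1] * prior_h + prior_cy;
  const float w = std::exp(var[2] * target[2]) * prior_w;
  const float h = std::exp(var[3] * target[3]) * prior_h;

  out[0] = cx - w / 2.f;  // xmin
  out[1] = cy - h / 2.f;  // ymin
  out[2] = cx + w / 2.f;  // xmax
  out[3] = cy + h / 2.f;  // ymax
}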
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -14,7 +11,6 @@ limitations under the License. */ ...@@ -14,7 +11,6 @@ limitations under the License. */
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include <string> #include <string>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -33,26 +29,12 @@ CLRuntime::~CLRuntime() { ...@@ -33,26 +29,12 @@ CLRuntime::~CLRuntime() {
command_queue_->flush(); command_queue_->flush();
command_queue_->finish(); command_queue_->finish();
} }
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto& p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
// For controlling the destruction order // For controlling the destruction order
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset(); command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset(); context_.reset();
device_.reset(); device_.reset();
platform_.reset(); platform_.reset();
device_info_.clear();
} }
bool CLRuntime::Init() { bool CLRuntime::Init() {
...@@ -90,14 +72,14 @@ cl::CommandQueue& CLRuntime::command_queue() { ...@@ -90,14 +72,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
return *command_queue_; return *command_queue_;
} }
std::shared_ptr<cl::Program> CLRuntime::CreateProgram( std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
const cl::Context& context, std::string file_name) { const cl::Context& context, std::string file_name) {
auto cl_file = opencl_kernels_files.find(file_name); auto cl_file = opencl_kernels_files.find(file_name);
std::string content(cl_file->second.begin(), cl_file->second.end()); std::string content(cl_file->second.begin(), cl_file->second.end());
cl::Program::Sources sources; cl::Program::Sources sources;
sources.push_back(content); sources.push_back(content);
auto prog = auto prog =
std::shared_ptr<cl::Program>(new cl::Program(context, sources, &status_)); std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
VLOG(4) << "OpenCL kernel file name: " << file_name; VLOG(4) << "OpenCL kernel file name: " << file_name;
VLOG(4) << "Program source size: " << content.size(); VLOG(4) << "Program source size: " << content.size();
CL_CHECK_FATAL(status_); CL_CHECK_FATAL(status_);
......
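CreateProgram now hands back a std::unique_ptr, so ownership of each cl::Program moves into the per-context cache instead of being shared out of CLRuntime. A small sketch of that ownership transfer with stand-in types (illustrative names, not the actual classes):

#include <memory>
#include <string>
#include <unordered_map>

struct Program { std::string source; };  // stand-in for cl::Program

std::unique_ptr<Program> CreateProgram(const std::string& src) {
  return std::unique_ptr<Program>(new Program{src});
}

int main() {
  // The cache is the single owner; clearing the map destroys the programs.
  std::unordered_map<std::string, std::unique_ptr<Program>> programs;
  programs["activation_kernel.cl -DCL_DTYPE_half"] = CreateProgram("/* source */");
  programs.clear();
  return 0;
}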
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -18,7 +15,6 @@ limitations under the License. */ ...@@ -18,7 +15,6 @@ limitations under the License. */
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
...@@ -43,7 +39,7 @@ class CLRuntime { ...@@ -43,7 +39,7 @@ class CLRuntime {
cl::CommandQueue& command_queue(); cl::CommandQueue& command_queue();
std::shared_ptr<cl::Program> CreateProgram(const cl::Context& context, std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::string file_name); std::string file_name);
std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context); std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
...@@ -58,14 +54,8 @@ class CLRuntime { ...@@ -58,14 +54,8 @@ class CLRuntime {
std::map<std::string, size_t>& GetDeviceInfo(); std::map<std::string, size_t>& GetDeviceInfo();
std::unordered_map<std::string, std::shared_ptr<cl::Program>>& programs() {
return programs_;
}
std::vector<std::unique_ptr<cl::Kernel>>& kernels() { return kernels_; }
std::map<std::string, int>& kernel_offset() { return kernel_offset_; }
private: private:
CLRuntime() = default; CLRuntime() { Init(); }
~CLRuntime(); ~CLRuntime();
...@@ -105,12 +95,6 @@ class CLRuntime { ...@@ -105,12 +95,6 @@ class CLRuntime {
std::shared_ptr<cl::CommandQueue> command_queue_{nullptr}; std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
std::unordered_map<std::string, std::shared_ptr<cl::Program>> programs_{};
std::vector<std::unique_ptr<cl::Kernel>> kernels_{};
std::map<std::string, int> kernel_offset_{};
cl_int status_{CL_SUCCESS}; cl_int status_{CL_SUCCESS};
bool initialized_{false}; bool initialized_{false};
......
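The default constructor is replaced by one that runs Init(), so the runtime is initialized as soon as CLRuntime::Global() first constructs the instance and the removed accessor methods are no longer needed to trigger setup. A minimal sketch of that self-initializing singleton pattern (illustrative only, not the actual class):

#include <iostream>

class Runtime {
 public:
  static Runtime& Global() {
    static Runtime instance;  // constructed (and initialized) on first use
    return instance;
  }
  bool initialized() const { return initialized_; }

 private:
  Runtime() { Init(); }        // private ctor runs Init(), like CLRuntime
  void Init() { initialized_ = true; }
  bool initialized_{false};
};

int main() {
  std::cout << Runtime::Global().initialized() << "\n";  // prints 1
  return 0;
}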
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
std::unique_ptr<rk::nn::Exection> Device::Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
) {
VLOG(3) << "[RKNPU] Build model";
rk_graph->SetInputsOutputs(input_nodes, output_nodes);
std::unique_ptr<rk::nn::Exection> exector =
std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
exector->Build();
return exector;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
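Device::Build wires the graph's inputs and outputs, wraps the graph in an rk::nn::Exection (the DDK's spelling), and builds it before returning. A self-contained sketch of the same build-then-return-executor pattern with stand-in types, since the real rk::nn API is only available with the RKNPU DDK:

#include <memory>
#include <vector>

// Stand-in types; the real graph/tensor construction is done by the RKNPU bridge.
struct Graph {
  void SetInputsOutputs(const std::vector<int>& ins,
                        const std::vector<int>& outs) {}
};
struct Exection {              // name mirrors the DDK spelling
  explicit Exection(Graph* g) : graph_(g) {}
  int Build() { return 0; }    // 0 == success
  Graph* graph_;
};

std::unique_ptr<Exection> BuildExecutor(Graph* graph,
                                        const std::vector<int>& inputs,
                                        const std::vector<int>& outputs) {
  graph->SetInputsOutputs(inputs, outputs);
  std::unique_ptr<Exection> exector(new Exection(graph));
  exector->Build();
  return exector;  // caller owns the executor, as in Device::Build above
}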
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
// Build the RK IR graph into an RKNPU model and return the executor
// used to load the model and run inference.
std::unique_ptr<rk::nn::Exection> Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
); // NOLINT
private:
};
} // namespace rknpu
} // namespace lite
} // namespace paddle
...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) lite_cc_library(x86_cpu_info SRCS cpu_info.cc)
add_subdirectory(jit) add_subdirectory(jit)
add_subdirectory(math) add_subdirectory(math)
...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { ...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() {
void* GetMKLMLDsoHandle() { void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib");
#elif defined(_WIN32) #elif defined(_WIN32)
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else #else
......
...@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> { ...@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// : nullptr; // : nullptr;
// fill in data // fill in data
std::vector<size_t> low_level; std::vector<uint64_t> low_level;
size_t low_offset = 0; uint64_t low_offset = 0;
for (auto &items : selected_items) { for (auto &items : selected_items) {
low_level.push_back(low_offset); low_level.push_back(low_offset);
for (auto &item : items) { for (auto &item : items) {
......
...@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids, ...@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* pre_scores) { paddle::framework::LoDTensor* pre_scores) {
// lod // lod
paddle::framework::LoD lod; paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4}); std::vector<uint64_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4}); std::vector<uint64_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0); lod.push_back(level0);
lod.push_back(level1); lod.push_back(level1);
ids->set_lod(lod); ids->set_lod(lod);
......
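Several hunks in this commit switch LoD offset vectors from size_t to uint64_t, presumably so the element type is fixed-width on every platform (size_t is only 32 bits on 32-bit targets). A sketch of constructing such a LoD, assuming LoD is a vector of per-level offset vectors as in the test above:

#include <cstdint>
#include <vector>

using LoD = std::vector<std::vector<uint64_t>>;

int main() {
  LoD lod;
  lod.push_back({0, 2, 4});        // level 0: two sequences
  lod.push_back({0, 1, 2, 3, 4});  // level 1: four sub-sequences
  return static_cast<int>(lod.size());
}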
...@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a, ...@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(), mat_a.data<T>(),
mat_b.data<T>(), mat_b.data<T>(),
beta, beta,
mat_out->mutable_data<T>()); mat_out->template mutable_data<T>());
} }
template <> template <>
...@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a, ...@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(), mat_a.data<T>(),
mat_b.data<T>(), mat_b.data<T>(),
beta, beta,
mat_out->mutable_data<T>()); mat_out->template mutable_data<T>());
} else { } else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
...@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a, ...@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(), mat_a.data<T>(),
mat_b.data<T>(), mat_b.data<T>(),
beta, beta,
mat_out->mutable_data<T>(), mat_out->template mutable_data<T>(),
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
dim_a.stride_, dim_a.stride_,
dim_b.stride_); dim_b.stride_);
......
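Many hunks in this section prefix mutable_data<T>() / data<T>() calls with the `template` keyword. Inside a class or function template, a member-template call made through a dependent expression needs that disambiguator, otherwise the `<` is parsed as a less-than operator; adding it is harmless where it is not strictly required. A minimal sketch of the rule with illustrative types (not the Paddle-Lite Tensor):

struct Tensor {
  template <typename T>
  T* mutable_data() { return nullptr; }
};

template <typename T, typename TensorT>
T* GetBuffer(TensorT* t) {
  // return t->mutable_data<T>();        // ill-formed: dependent member template
  return t->template mutable_data<T>();  // OK: 'template' disambiguates the call
}

int main() {
  Tensor t;
  float* p = GetBuffer<float>(&t);
  return p == nullptr ? 0 : 1;
}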
...@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> { ...@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
// auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace()); // auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation // computation
auto output_data = output->mutable_data<T>(); auto output_data = output->template mutable_data<T>();
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
int col_len = input_cols[j]; int col_len = input_cols[j];
...@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> { ...@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
int col_len = output_cols[j]; int col_len = output_cols[j];
auto* out_tensor = outputs->at(j); auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) { if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len; T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
std::copy_n(src_ptr + col_idx, col_len, dst_ptr); std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
// memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
// sizeof(T) * col_len); // sizeof(T) * col_len);
......
...@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> { ...@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
.reshape(batch_axis_remain) .reshape(batch_axis_remain)
.sum(Eigen::DSizes<int, 1>(1))); .sum(Eigen::DSizes<int, 1>(1)));
} else { } else {
const T* prob_data = prob->data<T>(); const T* prob_data = prob->template data<T>();
T* loss_data = out->mutable_data<T>(); T* loss_data = out->template mutable_data<T>();
const int64_t* label_data = labels->data<int64_t>(); const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
......
...@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO, ...@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->mutable_data<T>(); T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
...@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF, ...@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
int col_width = col->dims()[1]; int col_width = col->dims()[1];
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
...@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF, ...@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
"col_width and padding(padding_left, padding_right) are " "col_width and padding(padding_left, padding_right) are "
"inconsistent."); "inconsistent.");
T* im_data = im->mutable_data<T>(); T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
......
...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, ...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, ...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
int output_width = col->dims()[4]; int output_width = col->dims()[4];
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width; int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width; size_t copy_size = sizeof(T) * output_width;
...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, ...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
constexpr int prw = 1; constexpr int prw = 1;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int im_size = im_height * im_width; int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
......
...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU { ...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {} : tensor_(tensor), value_(value) {}
template <typename T> template <typename T>
void apply() const { void apply() const {
auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86); auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_)); std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
} }
lite::Tensor* tensor_; lite::Tensor* tensor_;
...@@ -125,7 +125,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -125,7 +125,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(output->dims(), in_dims); PADDLE_ENFORCE_EQ(output->dims(), in_dims);
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>(); const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * size + j] = output_data[i * size + j] =
......
...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> { ...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> { ...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......
...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> { ...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
// c_size means the output size of each sample // c_size means the output size of each sample
int c_size = fea_size * output_channels; int c_size = fea_size * output_channels;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i; int new_bindex = c_size * i;
...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> { ...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i; int blen = fea_size * output_channels * i;
......
...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int input_stride = input_height * input_width; const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> { ...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> { ...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -58,11 +58,11 @@ class SampleWithProb { ...@@ -58,11 +58,11 @@ class SampleWithProb {
const int64_t* label_data = L->data<int64_t>(); const int64_t* label_data = L->data<int64_t>();
// int64_t* samples_data = // int64_t* samples_data =
// S->mutable_data<int64_t>(ret_dim, Target); // S->mutable_data<int64_t>(ret_dim, Target);
// T* probabilities_data = P->mutable_data<T>(ret_dim, Target); // T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
S->Resize({batch_size, num_sampled_classes}); S->Resize({batch_size, num_sampled_classes});
auto* samples_data = S->mutable_data<int64_t>(Target); auto* samples_data = S->mutable_data<int64_t>(Target);
P->Resize({batch_size, num_sampled_classes}); P->Resize({batch_size, num_sampled_classes});
auto* probabilities_data = P->mutable_data<T>(Target); auto* probabilities_data = P->template mutable_data<T>(Target);
// temp sets for unique sampling // temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples; std::unordered_set<int64_t> tmp_samples;
......
...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> { ...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size})); lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>(); const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86); auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>(); const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas, call_gemm<lite::X86Context, T>(blas,
......
...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> { ...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->mutable_data<T>(); auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
std::copy_n(in1_data, in1_value.numel(), out_data); std::copy_n(in1_data, in1_value.numel(), out_data);
...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> { ...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
functor(context, output, 0.0); functor(context, output, 0.0);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> { ...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->mutable_data<T>(); auto* in2_data = in2_value->template mutable_data<T>();
std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
} }
}; };
...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> { ...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
input2->set_rows(in2_rows); input2->set_rows(in2_rows);
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
T* in2_data = in2_value->mutable_data<T>(); T* in2_data = in2_value->template mutable_data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
size_t offset = 0u; size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) { for (size_t i = 0u; i != input1.size(); ++i) {
...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> { ...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->mutable_data<T>(); auto* input2_data = input2->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>( lite::DDim dims(std::vector<int64_t>(
{static_cast<int64_t>(merged_row_set.size()), input_width})); {static_cast<int64_t>(merged_row_set.size()), input_width}));
out.mutable_value()->Resize(dims); out.mutable_value()->Resize(dims);
auto* out_data = out.mutable_value()->mutable_data<T>(); auto* out_data = out.mutable_value()->template mutable_data<T>();
if (merged_row_set.size() == row_num && !sorted_result) { if (merged_row_set.size() == row_num && !sorted_result) {
// no duplicated ids, just concat the result together // no duplicated ids, just concat the result together
...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> { ...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->template data<T>();
// FIXME(typhoonzero): use macro fix the below messy code. // FIXME(typhoonzero): use macro fix the below messy code.
switch (op) { switch (op) {
......
...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public: public:
void operator()(const lite::Context<lite::TargetType::kX86>& context, void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index) { bool is_src_index) {
const size_t* index = index_lod.data(); const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims(); const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims(); const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
auto height = dst_dims[0]; auto height = dst_dims[0];
auto width = dst_dims[1]; auto width = dst_dims[1];
auto* src_data = src.data<T>(); auto* src_data = src.data<T>();
auto* dst_data = dst->mutable_data<T>(); auto* dst_data = dst->template mutable_data<T>();
const int sz = width * sizeof(T); const int sz = width * sizeof(T);
if (is_src_index) { if (is_src_index) {
for (int i = 0; i < height; ++i) { for (int i = 0; i < height; ++i) {
......
...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { ...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context, void operator()(const lite::Context<Target>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { ...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor {
// batch_lods[2] is the sort order for the input LoDTensor. // batch_lods[2] is the sort order for the input LoDTensor.
batch_lods->at(2).resize(seq_info.size()); batch_lods->at(2).resize(seq_info.size());
size_t* batch_starts = batch_lods->at(0).data(); auto* batch_starts = batch_lods->at(0).data();
size_t* seq2batch_idx = batch_lods->at(1).data(); auto* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) { for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]); auto batch_id = static_cast<int>(batch_starts[n]);
...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { ...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor {
} }
batch_starts[n + 1] = static_cast<size_t>(batch_id); batch_starts[n + 1] = static_cast<size_t>(batch_id);
} }
size_t* seq_order = batch_lods->at(2).data(); auto* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) { for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx; seq_order[i] = seq_info[i].seq_idx;
} }
......
...@@ -22,15 +22,15 @@ namespace math { ...@@ -22,15 +22,15 @@ namespace math {
template <typename T> template <typename T>
void CopyValidData(lite::Tensor* dst_tensor, void CopyValidData(lite::Tensor* dst_tensor,
const lite::Tensor* src_tensor, const lite::Tensor* src_tensor,
const std::vector<size_t>& seq_offsets, const std::vector<uint64_t>& seq_offsets,
int pad_seq_len, int pad_seq_len,
int step_width, int step_width,
bool norm_by_len, bool norm_by_len,
CopyType type, CopyType type,
PadLayout layout) { PadLayout layout) {
int seq_num = seq_offsets.size() - 1; int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>(); const T* src_data = src_tensor->template data<T>();
T* dst_data = dst_tensor->mutable_data<T>(); T* dst_data = dst_tensor->template mutable_data<T>();
int seq_cpy_gap = step_width; int seq_cpy_gap = step_width;
int pad_cpy_gap = int pad_cpy_gap =
...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
"'step_width'."); "'step_width'.");
// fill padding value // fill padding value
T* pad_data = pad_tensor->mutable_data<T>(); T* pad_data = pad_tensor->template mutable_data<T>();
const T* pad_value_data = pad_value.data<T>(); const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) { if (pad_value.numel() == 1) {
fast_mem_init<T>( fast_mem_init<T>(
......
...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; ...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq }; enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength( inline static uint64_t MaximumSequenceLength(
const std::vector<size_t>& seq_offset) { const std::vector<uint64_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1; uint64_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0; uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) { for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
} }
...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( ...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength(
inline static void CheckDims(const lite::DDim& seq_tensor_dims, inline static void CheckDims(const lite::DDim& seq_tensor_dims,
const lite::DDim& pad_tensor_dims, const lite::DDim& pad_tensor_dims,
const std::vector<size_t>& seq_offset, const std::vector<uint64_t>& seq_offset,
int64_t padded_seq_len, int64_t padded_seq_len,
int64_t step_width, int64_t step_width,
const PadLayout& layout) { const PadLayout& layout) {
......
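MaximumSequenceLength above scans consecutive offsets and keeps the largest gap. A quick worked example of the same loop on the offsets used elsewhere in this commit's tests ({0, 2, 7, 10} gives lengths 2, 5, 3, so the maximum is 5):

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t> seq_offset = {0, 2, 7, 10};
  uint64_t max_seq_len = 0;
  for (size_t i = 0; i + 1 < seq_offset.size(); ++i) {
    max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return static_cast<int>(max_seq_len);  // 5
}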
...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { ...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int* max_index = index->mutable_data<int>(); int* max_index = index->mutable_data<int>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq; int64_t dim = output->numel() / num_seq;
...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { ...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor {
const T* og_data = out_grad.data<T>(); const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>(); const int* max_index = index.data<int>();
T* ig_data = in_grad->mutable_data<T>(); T* ig_data = in_grad->template mutable_data<T>();
SetConstant<TARGET(kX86), T> set_zero; SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0)); set_zero(context, in_grad, static_cast<T>(0.0));
...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor { ...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { ...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { ...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor {
int64_t in_w = in_grad->numel() / in_grad->dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w); PADDLE_ENFORCE(in_w == out_w);
const T* out_g_data = out_grad.data<T>(); const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->mutable_data<T>(TARGET(kX86)); T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context); auto blas = math::GetBlas<TARGET(kX86), T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]); int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> { ...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") { if (pooltype == "SUM") {
const T* src = input.data<T>(); const T* src = input.data<T>();
T* dst = output->mutable_data<T>(TARGET(kX86)); T* dst = output->template mutable_data<T>(TARGET(kX86));
jit::seq_pool_attr_t attr( jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]), static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum); jit::SeqPoolType::kSum);
......
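For the "SUM" branch above, the jit kernel adds up all rows that belong to one sequence. A plain reference of the same reduction (a sketch for checking results, not the jit path; `w` is the row width, i.e. numel / dims[0]):

#include <cstdint>
#include <vector>

void SumPoolRef(const float* src, const std::vector<uint64_t>& lod, int w,
                float* dst) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    for (int j = 0; j < w; ++j) dst[i * w + j] = 0.f;
    // Accumulate every row of sequence i, i.e. rows [lod[i], lod[i+1]).
    for (uint64_t r = lod[i]; r < lod[i + 1]; ++r) {
      for (int j = 0; j < w; ++j) dst[i * w + j] += src[r * w + j];
    }
  }
}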
...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
TEST(SequencePoolingGrad, CPU_SUM) { TEST(SequencePoolingGrad, CPU_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod2); float>(lod2);
...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { ...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(SequencePoolingGrad, CUDA_SUM) { TEST(SequencePoolingGrad, CUDA_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod2); float>(lod2);
......
...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> {
size_t seq_width = seq->dims()[1]; size_t seq_width = seq->dims()[1];
lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(lite::TargetType::kX86); T* seq_data = seq->template mutable_data<T>(lite::TargetType::kX86);
for (size_t i = 0; i < num_seq; ++i) { for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width; for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width; j < lod[level][i + 1] * seq_width;
......
...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86); auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0; int offset = 0;
std::vector<size_t> vec_out_lod; std::vector<uint64_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1); vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) { for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i]; offset = row_lod[i];
...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
out->set_lod(lod_temp); out->set_lod(lod_temp);
auto in_data = in.data<T>(); auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86); auto out_data = out->template mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k]; T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
......
...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> { ...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* in_data = X->data<T>(); const T* in_data = X->template data<T>();
auto* out_data = Y->mutable_data<T>(); auto* out_data = Y->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes); T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1); max_val *= static_cast<T>(-1);
...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> { ...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* out_data = y->data<T>(); const T* out_data = y->template data<T>();
const T* out_grad = y_grad->data<T>(); const T* out_grad = y_grad->template data<T>();
T* in_grad = x_grad->mutable_data<T>(); T* in_grad = x_grad->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T scalar; T scalar;
vec_mul_reduce<T, lite::x86::avx>( vec_mul_reduce<T, lite::x86::avx>(
......
...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> { ...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
patch_size = processing_list.size(); patch_size = processing_list.size();
// T *patch_data = // T *patch_data =
// patch->mutable_data<T>({static_cast<int64_t>(patch_size), // patch->template mutable_data<T>({static_cast<int64_t>(patch_size),
// static_cast<int64_t>(patch_elem_size)}, // static_cast<int64_t>(patch_elem_size)},
// cpu_place); // cpu_place);
patch->Resize({static_cast<int64_t>(patch_size), patch->Resize({static_cast<int64_t>(patch_size),
static_cast<int64_t>(patch_elem_size)}); static_cast<int64_t>(patch_elem_size)});
auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86); auto *patch_data = patch->template mutable_data<T>(lite::TargetType::kX86);
constant(context, patch, 0); constant(context, patch, 0);
const T *features = node_features.data<T>(); const T *features = node_features.data<T>();
...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> { ...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> {
} }
} }
// T *grad_data = // T *grad_data =
// in_grad->mutable_data<T>({static_cast<int64_t>(node_count), // in_grad->template mutable_data<T>({static_cast<int64_t>(node_count),
// static_cast<int64_t>(grad_elem_size)}, // static_cast<int64_t>(grad_elem_size)},
// cpu_place); // cpu_place);
in_grad->Resize({static_cast<int64_t>(node_count), in_grad->Resize({static_cast<int64_t>(node_count),
static_cast<int64_t>(grad_elem_size)}); static_cast<int64_t>(grad_elem_size)});
auto *grad_data = in_grad->mutable_data<T>(lite::TargetType::kX86); auto *grad_data = in_grad->template mutable_data<T>(lite::TargetType::kX86);
constant(context, in_grad, 0); constant(context, in_grad, 0);
const T *out_g = out_grad.data<T>(); const T *out_g = out_grad.data<T>();
......
...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> { ...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) { for (int i = 0; i < input_feasize; ++i) {
...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> { ...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> { ...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
"mismatching."); "mismatching.");
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> { ...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
output_width, output_width,
"input_width and output_width are " "input_width and output_width are "
"mismatching."); "mismatching.");
T* vol_data = vol->mutable_data<T>(); T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
......
...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) ...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return() return()
endif() endif()
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
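// e.g. round_half_to_even(0.5) == 0, (1.5) == 2, (2.5) == 2: exact halves are
// rounded to the nearest even integer instead of away from zero.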
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
  // Early check to keep INF or overly large values out of the conversion below.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
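// Worked example of the scaling above: with max_val = 1.0f every float is
// multiplied by 32767 / max_val and rounded half-to-even, so 1.0f maps to
// 32767, 0.5f maps to 16384, and inputs whose magnitude exceeds max_val
// saturate at +/-32767.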
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
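// e.g. with h = 2, w = 3 the element in[1 * w + 0] (row 1, col 0) is written
// to out[0 * h + 1] (row 0, col 1), i.e. the h x w row-major input becomes a
// w x h row-major output.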
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
 * Matrix Descriptor of a memory buffer.
 *
 * It is used for Blas::MatMul. The MatMul operator can be batched:
 * if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], the call is
 * equivalent to `batch_size` GEMMs. The batched GEMM can be faster, depending
 * on the implementation of the BLAS library. The batch size may be zero. If
 * either matrix of `matmul` has a batch size, the result is a batched GEMM as
 * well, e.g. if Mat A is [BatchSize, H1, H2] and Mat B is [H2, W2], the result
 * matrix will be [BatchSize, H1, W2].
 *
 * The boolean flag `trans` describes whether the memory holds the transpose of
 * the matrix. If `trans` is true, the last two dims of the matrix are
 * transposed, i.e. the memory layout is [Width, Height] or
 * [BatchSize, Width, Height].
 *
 * The MatDescriptor is not only the dimension or shape of a matrix; it also
 * carries the layout and stride of the matrix. It is clearer to have a
 * dedicated structure than to reuse `DDim`.
 */
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
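CreateMatrixDescriptor folds all leading dims except the last two into the batch size. A minimal sketch of the expected mapping under the definitions above (illustrative shape, assuming the call site can see namespace paddle::lite::xpu::math):

  lite::DDim dim({4, 8, 16});
  MatDescriptor desc = CreateMatrixDescriptor(dim, 0, /*trans=*/false);
  // desc.batch_size_ == 4, desc.height_ == 8, desc.width_ == 16,
  // desc.stride_ == 8 * 16; with trans == true, height_ and width_ are swapped.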
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
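A minimal host-to-device round trip under the wrappers above might look like the following sketch (error handling and real buffer contents omitted):

  float host[16] = {0.f};
  void* dev = paddle::lite::TargetWrapperXPU::Malloc(sizeof(host));
  paddle::lite::TargetWrapperXPU::MemcpySync(
      dev, host, sizeof(host), paddle::lite::IoDirection::HtoD);
  paddle::lite::TargetWrapperXPU::MemcpySync(
      host, dev, sizeof(host), paddle::lite::IoDirection::DtoH);
  paddle::lite::TargetWrapperXPU::Free(dev);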
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc ...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86 X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm) BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
...@@ -36,7 +38,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) ...@@ -36,7 +38,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
if (LITE_WITH_ARM) if (LITE_WITH_ARM)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context)
else() else()
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context)
endif() endif()
#-------------------------------------------- GET CODE META INFO ------------------------------------------ #-------------------------------------------- GET CODE META INFO ------------------------------------------
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -15,5 +15,11 @@ ...@@ -15,5 +15,11 @@
#include "lite/core/context.h" #include "lite/core/context.h"
namespace paddle { namespace paddle {
namespace lite {} // namespace lite namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -16,14 +16,21 @@ ...@@ -16,14 +16,21 @@
#include "lite/utils/any.h" #include "lite/utils/any.h"
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/blas.h" #include "lite/backends/cuda/context.h"
#include "lite/backends/cuda/cuda_utils.h"
#endif #endif
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
#include <unordered_map> #include <unordered_map>
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#endif #endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
#include <map> #include <map>
#include <memory> #include <memory>
...@@ -45,7 +52,6 @@ class Context; ...@@ -45,7 +52,6 @@ class Context;
using HostContext = Context<TargetType::kHost>; using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using CUDAContext = Context<TargetType::kCUDA>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>; using NPUContext = Context<TargetType::kNPU>;
using XPUContext = Context<TargetType::kXPU>; using XPUContext = Context<TargetType::kXPU>;
...@@ -53,6 +59,7 @@ using OpenCLContext = Context<TargetType::kOpenCL>; ...@@ -53,6 +59,7 @@ using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>; using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>; using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>; using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
template <> template <>
class Context<TargetType::kHost> { class Context<TargetType::kHost> {
...@@ -97,17 +104,59 @@ class Context<TargetType::kBM> { ...@@ -97,17 +104,59 @@ class Context<TargetType::kBM> {
}; };
#endif #endif
#ifdef LITE_WITH_RKNPU
template <>
class Context<TargetType::kRKNPU> {
public:
Context() {}
explicit Context(const RKNPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(RKNPUContext* ctx) {}
RKNPUContext& operator=(const RKNPUContext& ctx) {}
std::string name() const { return "RKNPUContext"; }
};
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
template <> template <>
class Context<TargetType::kXPU> { class Context<TargetType::kXPU> {
public: public:
Context() {} Context() {}
explicit Context(const XPUContext& ctx); explicit Context(const XPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {} void CopySharedTo(XPUContext* ctx) {}
static xdnn::Context* GetRawContext() {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
}
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
}
xpu_set_device(dev_no);
}
std::string name() const { return "XPUContext"; } std::string name() const { return "XPUContext"; }
private:
static thread_local xdnn::Context* _tls_raw_ctx;
}; };
#endif #endif
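The raw xdnn context is created lazily, one per thread, the first time GetRawContext() is called. A minimal sketch of how the static helpers above might be used (assuming code inside namespace paddle::lite with LITE_WITH_XPU defined):

  XPUContext::SetDev(0);             // or set the LITE_XPU_DEV env var
  XPUContext::SetWorkspaceL3Size();  // defaults to 0xfffc00 bytes of L3
  xdnn::Context* raw = XPUContext::GetRawContext();  // lazily created, thread-local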
...@@ -172,18 +221,20 @@ class Context<TargetType::kFPGA> { ...@@ -172,18 +221,20 @@ class Context<TargetType::kFPGA> {
}; };
#endif #endif
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_MLU
// Only works with CUDA kernels.
template <> template <>
class Context<TargetType::kCUDA> { class Context<TargetType::kMLU> {
public: public:
typename Env<TargetType::kCUDA>::Devs& devs = typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {}
void InitOnce() {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>(); MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
} }
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL) CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!"; << "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) { if (dev_id >= static_cast<int>(devs.size())) {
...@@ -193,77 +244,61 @@ class Context<TargetType::kCUDA> { ...@@ -193,77 +244,61 @@ class Context<TargetType::kCUDA> {
} else { } else {
device_id_ = dev_id; device_id_ = dev_id;
} }
if (io_stream_id >= devs[dev_id].max_stream()) { SetMluDevice(device_id_);
LOG(WARNING) << "data stream index exceeds the maximum stream number, " if (io_queue_id >= devs[dev_id].max_queue()) {
"set to default stream(0)!"; LOG(WARNING) << "data queue index exceeds the maximum queue number, "
io_stream_id = 0; "set to default queue(0)!";
io_queue_id = 0;
} }
if (exec_stream_id >= devs[dev_id].max_stream()) { if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec stream index exceeds the maximum stream number, " LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default stream(0)!"; "set to default qeueu(0)!";
exec_stream_id = 0; exec_queue_id = 0;
} }
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; exec_queue_id_ = exec_queue_id;
io_stream_ = devs[dev_id].io_streams()[io_stream_id]; io_queue_id_ = io_queue_id;
exec_stream_id_ = exec_stream_id;
io_stream_id_ = io_stream_id;
}
void CopySharedTo(CUDAContext* ctx) {
CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_;
} }
const cudaStream_t& exec_stream() const { return exec_stream_; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
const cudaStream_t& io_stream() const { return io_stream_; } const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; } const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetCuBlasFP32(std::shared_ptr<cuda::Blas<float>> cublas_fp32) { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cublas_fp32_ = cublas_fp32;
}
const std::vector<cudaEvent_t>& input_events() { return input_events_; } cnmlCoreVersion_t MLUCoreVersion() {
void SetInputEvents(const std::vector<cudaEvent_t>& input_events) { return DeviceInfo::Global().MLUCoreVersion();
input_events_.clear();
input_events_.assign(input_events.begin(), input_events.end());
} }
const std::vector<cudaEvent_t>& output_events() { return output_events_; } int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
void SetOutputEvents(const std::vector<cudaEvent_t>& output_events) {
output_events_.clear();
output_events_.assign(output_events.begin(), output_events.end());
}
std::string name() const { return "CUDAContext"; } u32_t affinity() { return affinity_; }
CUDAContext& operator=(const CUDAContext& context) { cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_); int device_id() { return device_id_; }
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this; std::string name() const { return "MLUContext"; }
}
private: private:
int device_id_; int device_id_;
// overall information // overall information
int exec_stream_id_; int exec_queue_id_;
int io_stream_id_; int io_queue_id_;
cudaStream_t exec_stream_; cnrtQueue_t io_queue_;
cudaStream_t io_stream_; cnrtQueue_t exec_queue_;
// not thread-safe, should allocate for each thread. std::vector<cnrtNotifier_t> input_notifiers_;
std::shared_ptr<cuda::Blas<float>> cublas_fp32_; std::vector<cnrtNotifier_t> output_notifiers_;
// kernel information cnrtInvokeFuncParam_t forward_param_;
std::vector<cudaEvent_t> input_events_; u32_t affinity_ = 0x01;
std::vector<cudaEvent_t> output_events_;
}; };
#endif #endif // LITE_WITH_MLU
#ifdef LITE_WITH_X86 #ifdef LITE_WITH_X86
template <> template <>
...@@ -337,7 +372,9 @@ class ContextScheduler { ...@@ -337,7 +372,9 @@ class ContextScheduler {
return *x; return *x;
} }
std::unique_ptr<KernelContext> NewContext(TargetType target) { std::unique_ptr<KernelContext> NewContext(
TargetType target,
/*only used for cuda context*/ int exec_stream_id = 0) {
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
switch (target) { switch (target) {
case TARGET(kHost): case TARGET(kHost):
...@@ -354,7 +391,7 @@ class ContextScheduler { ...@@ -354,7 +391,7 @@ class ContextScheduler {
case TARGET(kCUDA): { case TARGET(kCUDA): {
int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice(); int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
auto& context = ctx->As<CUDAContext>(); auto& context = ctx->As<CUDAContext>();
context.Init(dev_id); context.Init(dev_id, exec_stream_id);
kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo( kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo(
&context); &context);
} break; } break;
...@@ -371,6 +408,12 @@ class ContextScheduler { ...@@ -371,6 +408,12 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
&ctx->As<RKNPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
case TARGET(kXPU): case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo( kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
...@@ -394,6 +437,16 @@ class ContextScheduler { ...@@ -394,6 +437,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo( kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>()); &ctx->As<BMContext>());
break; break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif #endif
default: default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
...@@ -430,11 +483,17 @@ class ContextScheduler { ...@@ -430,11 +483,17 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>(); InitContext<TargetType::kXPU, XPUContext>();
#endif #endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>(); InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif #endif
} }
......
...@@ -58,7 +58,7 @@ ...@@ -58,7 +58,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_; thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_; thread_local int DeviceInfo::mem_size_;
...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_; ...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_; thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0; thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS #ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { ...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
return 0; return 0;
} }
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
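// A minimal configuration sketch for the setters above (values are
// illustrative only):
//   DeviceInfo::Global().SetMLURunMode(lite_api::MLUCoreVersion::MLU_270,
//                                      /*core_number=*/16,
//                                      /*use_first_conv=*/false,
//                                      /*mean_vec=*/{},
//                                      /*std_vec=*/{},
//                                      DATALAYOUT(kNCHW));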
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP #ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_); thread_num = std::min(thread_num, core_num_);
...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { ...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
void Device<TARGET(kMLU)>::GetInfo() {}
void Device<TARGET(kMLU)>::CreateQueue() {
exec_queue_.clear();
io_queue_.clear();
for (size_t i = 0; i < max_queue_; ++i) {
cnrtQueue_t exec_queue;
cnrtQueue_t io_queue;
cnrtCreateQueue(&exec_queue);
cnrtCreateQueue(&io_queue);
exec_queue_.push_back(exec_queue);
io_queue_.push_back(io_queue);
cnrtCreateQueue(&exec_queue);
exec_queue_.push_back(exec_queue);
}
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() { void Device<TARGET(kCUDA)>::Init() {
......
...@@ -19,11 +19,14 @@ ...@@ -19,11 +19,14 @@
#include <vector> #include <vector>
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum { typedef enum {
kAPPLE = 0, kAPPLE = 0,
...@@ -52,6 +55,20 @@ class DeviceInfo { ...@@ -52,6 +55,20 @@ class DeviceInfo {
int Setup(); int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num); void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size); void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; } void SetArch(ARMArch arch) { arch_ = arch; }
...@@ -103,6 +120,15 @@ class DeviceInfo { ...@@ -103,6 +120,15 @@ class DeviceInfo {
static thread_local TensorLite workspace_; static thread_local TensorLite workspace_;
static thread_local int64_t count_; static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...); void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...); void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...); void SetFP32Info(int argc, ...);
...@@ -133,7 +159,10 @@ class Env { ...@@ -133,7 +159,10 @@ class Env {
static Devs* devs = new Devs(); static Devs* devs = new Devs();
return *devs; return *devs;
} }
static void Init(int max_stream = 4) { static void Init(int max_stream = 6) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global(); Devs& devs = Global();
if (devs.size() > 0) { if (devs.size() > 0) {
return; return;
...@@ -142,10 +171,11 @@ class Env { ...@@ -142,10 +171,11 @@ class Env {
// Get device count // Get device count
count = API::num_devices(); count = API::num_devices();
if (count == 0) { if (count == 0) {
CHECK(false) << "No device found!"; LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!";
} else { } else {
LOG(INFO) << "Found " << count << " device(s)"; LOG(INFO) << "Found " << count << " device(s)";
} }
CHECK_GT(max_stream, 0) << "max_stream must be greater than 0.";
// create all device // create all device
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
auto dev = Device<Type>(i, max_stream); auto dev = Device<Type>(i, max_stream);
...@@ -156,6 +186,41 @@ class Env { ...@@ -156,6 +186,41 @@ class Env {
} }
}; };
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
template <>
class Device<TARGET(kMLU)> {
public:
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
template <> template <>
class Device<TARGET(kCUDA)> { class Device<TARGET(kCUDA)> {
...@@ -170,8 +235,8 @@ class Device<TARGET(kCUDA)> { ...@@ -170,8 +235,8 @@ class Device<TARGET(kCUDA)> {
std::string name() { return device_prop_.name; } std::string name() { return device_prop_.name; }
int core_num() { return device_prop_.multiProcessorCount; } int core_num() { return device_prop_.multiProcessorCount; }
float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; }
std::vector<cudaStream_t> exec_streams() { return exec_stream_; } const std::vector<cudaStream_t>& exec_streams() { return exec_stream_; }
std::vector<cudaStream_t> io_streams() { return io_stream_; } const std::vector<cudaStream_t>& io_streams() { return io_stream_; }
int sm_version() { return sm_version_; } int sm_version() { return sm_version_; }
bool has_fp16() { return has_fp16_; } bool has_fp16() { return has_fp16_; }
......
...@@ -83,6 +83,9 @@ class KernelBase { ...@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA) #if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset(); WorkSpace::Global_CUDA().AllocReset();
#endif #endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
...@@ -47,6 +47,16 @@ void* TargetMalloc(TargetType target, size_t size) { ...@@ -47,6 +47,16 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size); data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
data = TargetWrapperXPU::Malloc(size);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target); LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
} }
...@@ -85,6 +95,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { ...@@ -85,6 +95,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
TargetWrapper<TARGET(kBM)>::Free(data); TargetWrapper<TARGET(kBM)>::Free(data);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
TargetWrapperXPU::Free(data);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown type"; LOG(FATAL) << "Unknown type";
} }
...@@ -116,6 +136,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { ...@@ -116,6 +136,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL: case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
...@@ -31,6 +31,14 @@ ...@@ -31,6 +31,14 @@
#include "lite/backends/bm/target_wrapper.h" #include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM #endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif // LITE_WITH_XPU
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { ...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
TargetWrapperCL::MemcpySync(dst, src, size, dir); TargetWrapperCL::MemcpySync(dst, src, size, dir);
break; break;
#endif // LITE_WITH_OPENCL #endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_FPGA #ifdef LITE_WITH_FPGA
case TARGET(kFPGA): case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir); TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
...@@ -126,7 +139,7 @@ class Buffer { ...@@ -126,7 +139,7 @@ class Buffer {
const size_t img_h, const size_t img_h,
void* host_ptr = nullptr) { void* host_ptr = nullptr) {
if (target != target_ || cl_image2d_width_ < img_w || if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h) { cl_image2d_height_ < img_h || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
Free(); Free();
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr); data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
......
...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes ...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
...@@ -35,6 +37,8 @@ lite_cc_library(mir_passes ...@@ -35,6 +37,8 @@ lite_cc_library(mir_passes
demo_pass.cc demo_pass.cc
runtime_context_assign_pass.cc runtime_context_assign_pass.cc
memory_optimize_pass.cc memory_optimize_pass.cc
multi_stream_analysis_pass.cc
mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
...@@ -69,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op) ...@@ -69,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
if (WITH_TESTING) if (WITH_TESTING)
list(APPEND pattern_deps gtest) list(APPEND pattern_deps gtest)
endif() endif()
lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)
# for mobile, unnecessary to compile the following testings. # for mobile, unnecessary to compile the following testings.
......
...@@ -27,8 +27,8 @@ ...@@ -27,8 +27,8 @@
#include "lite/utils/string.h" #include "lite/utils/string.h"
namespace paddle { namespace paddle {
namespace inference { namespace lite {
namespace analysis { namespace mir {
static size_t dot_node_counter{0}; static size_t dot_node_counter{0};
...@@ -162,6 +162,6 @@ class Dot { ...@@ -162,6 +162,6 @@ class Dot {
std::vector<Attr> attrs_; std::vector<Attr> attrs_;
}; };
} // namespace analysis } // namespace mir
} // namespace inference } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose ...@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_interpolate lite_cc_library(fuse_interpolate
SRCS interpolate_fuser.cc SRCS interpolate_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_pool_concat lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
......
This diff has been collapsed.
This diff has been collapsed.
...@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} }
size_t weight_num = conv_weight_t->data_size(); size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization = bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
// compute BN alpha and beta // compute BN alpha and beta
Tensor alpha_tensor, beta_tensor; Tensor alpha_tensor, beta_tensor;
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "lite/core/mir/generate_program_pass.h" #include "lite/core/mir/generate_program_pass.h"
#include <memory> #include <memory>
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/graph_visualize_pass.h"
...@@ -25,10 +26,37 @@ namespace mir { ...@@ -25,10 +26,37 @@ namespace mir {
void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "final program \n" << Visualize(graph.get()); VLOG(4) << "final program \n" << Visualize(graph.get());
for (auto& item : graph->StmtTopologicalOrder()) { std::vector<Node*> nodes_in_order;
#ifdef LITE_WITH_CUDA
const std::string depend_pass = "multi_stream_analysis_pass";
const std::string attr_name = "nodes_in_order";
mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass);
if (pass->HasAttr(attr_name)) {
nodes_in_order = pass->GetAttr<std::vector<Node*>>(attr_name);
}
#endif
if (nodes_in_order.empty()) {
nodes_in_order = graph->StmtTopologicalOrder();
}
for (auto& item : nodes_in_order) {
if (item->IsStmt()) { if (item->IsStmt()) {
auto& stmt = item->AsStmt(); auto& stmt = item->AsStmt();
VLOG(4) << stmt; VLOG(4) << stmt;
#ifdef LITE_WITH_CUDA
if (stmt.kernels().front()->target() == TargetType::kCUDA) {
stmt.kernels()
.front()
->mutable_context()
->As<CUDAContext>()
.SetNeedSync(stmt.need_sync_);
stmt.kernels()
.front()
->mutable_context()
->As<CUDAContext>()
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
} }
} }
......
...@@ -26,15 +26,13 @@ namespace paddle { ...@@ -26,15 +26,13 @@ namespace paddle {
namespace lite { namespace lite {
namespace mir { namespace mir {
using inference::analysis::Dot;
void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(5) << "\n" << Visualize(graph.get()); VLOG(5) << "\n" << Visualize(graph.get());
} }
std::string Visualize(mir::SSAGraph* graph) { std::string Visualize(mir::SSAGraph* graph) {
std::ostringstream os; std::ostringstream os;
inference::analysis::Dot dot; Dot dot;
auto string_trunc = [](const std::string& str) -> std::string { auto string_trunc = [](const std::string& str) -> std::string {
const int max_disp_size = 100; const int max_disp_size = 100;
if (str.length() > max_disp_size) if (str.length() > max_disp_size)
...@@ -87,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -87,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) {
if (!node->IsStmt()) continue; if (!node->IsStmt()) continue;
auto op_info = node->AsStmt().op_info(); auto op_info = node->AsStmt().op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); std::string op_name;
if (node->AsStmt().need_sync_) {
std::ostringstream oss;
for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) {
oss << std::to_string(node->AsStmt().sync_streams_[i]);
if (i != node->AsStmt().sync_streams_.size() - 1) {
oss << ",";
}
}
op_name = string_format("%s%d, stream=%d, sync_streams={%s}",
op_type.c_str(),
op_idx++,
node->AsStmt().stream_id_,
oss.str().c_str());
} else {
op_name = string_format("%s%d", op_type.c_str(), op_idx++);
}
// Add its input&output variables as the Dot nodes // Add its input&output variables as the Dot nodes
dot.AddNode(op_name, dot.AddNode(op_name,
{Dot::Attr("shape", "box"), {Dot::Attr("shape", "box"),
...@@ -95,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -95,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) {
Dot::Attr("color", "black"), Dot::Attr("color", "black"),
Dot::Attr("fillcolor", "yellow")}); Dot::Attr("fillcolor", "yellow")});
for (auto& x : node->inlinks) { for (auto& x : node->inlinks) {
auto var_name = x->AsArg().name; std::string var_name;
if (x->AsArg().lane != -1) {
var_name = string_format(
"%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane);
} else {
var_name = x->AsArg().name;
}
if (!exists_var_names.count(var_name)) { if (!exists_var_names.count(var_name)) {
dot.AddNode(var_name, {}); dot.AddNode(var_name, {});
exists_var_names.insert(var_name); exists_var_names.insert(var_name);
...@@ -103,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -103,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) {
dot.AddEdge(var_name, op_name, {}); dot.AddEdge(var_name, op_name, {});
} }
for (auto& x : node->outlinks) { for (auto& x : node->outlinks) {
auto var_name = x->AsArg().name; std::string var_name;
if (x->AsArg().lane != -1) {
var_name = string_format(
"%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane);
} else {
var_name = x->AsArg().name;
}
if (!exists_var_names.count(var_name)) { if (!exists_var_names.count(var_name)) {
dot.AddNode(var_name, {}); dot.AddNode(var_name, {});
exists_var_names.insert(var_name); exists_var_names.insert(var_name);
......
...@@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)}) .BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)});
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h" #include "lite/core/mir/mlu_postprocess_pass.h"
#include <list> #include <list>
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 4); // FP16 op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NCHW -> NHWC // NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1}); op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetInput("Input", {cur_node->AsArg().name});
...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
is_found = true; const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
DataLayoutCompatible(*out_arg_ty, *cast_type) &&
// for first conv
PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext( if (op_type == "layout") {
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
cast_arg->AsArg().type = cast_type; cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter manually set the tensor's type // for CastAfter manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>(); var->GetMutable<paddle::lite::Tensor>();
// create the stmt node // create the stmt node
auto* cast_inst = graph->NewInstructNode(); auto* cast_inst = graph->NewInstructNode();
...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 5); // FP16 op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name}); op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NHWC -> NCHW // NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2}); op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetInput("Input", {cast_arg_name});
...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
is_found = true; const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext( if (op_type == "layout") {
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, ...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
auto* cur_node = head_node; auto* cur_node = head_node;
const auto name_prefix = const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
bool is_first_conv_head =
std::find(first_conv_nodes_.begin(),
first_conv_nodes_.end(),
head_node->AsArg().name) != first_conv_nodes_.end();
// layout cast node // precision cast node
if (head_type->layout() != inst_type->layout()) { if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout())); head_type->target(), inst_type->precision(), head_type->layout()));
} }
// precision cast node // layout cast node
if (head_type->precision() != inst_type->precision()) { if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph's valid precision // get subgraph's valid precision
const auto& places = graph->valid_places(); const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set; std::set<paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) { for (const auto& place : places) {
if (place.target == TARGET(kMLU)) { if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision); prec_set.insert(place.precision);
...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
const auto name_prefix = const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node // precision cast node
if (tail_type->layout() != inst_type->layout()) { if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout())); tail_type->target(), inst_type->precision(), tail_type->layout()));
} }
// precision cast node // layout cast node
if (tail_type->precision() != inst_type->precision()) { if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i); auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo( UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* graph like this
 *        subgraph_op_0
 *          /       \
 *         /         \
 * subgraph_op_1   host_op
 */
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
} }
// recreate the op // recreate the op
...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { ...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
} }
} }
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
return true;
}
}
}
}
return false;
}
bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
CHECK(arg_node->IsArg());
for (auto& inst : arg_node->outlinks) {
if (inst->AsStmt().op_type() == "subgraph") {
return IsFirstConvInSubgraph(arg_node, inst);
}
}
return false;
}
void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
if (IsFirstConvNode(out)) {
first_conv_nodes_.insert(out->AsArg().name);
// modify first conv nodes' type
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
paddle::lite_api::PrecisionType::kInt8,
old_type->layout(),
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue; if (!node.IsStmt()) continue;
...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
out->AsArg().type = out->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
inp->AsArg().type = inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
} }
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// currently for non-persistent input and output args, mlu subgraph op // currently for non-persistent input and output args, mlu subgraph op
// only supports float16/float32 data types // only supports float16/float32 data types
// in two situations as follows: // in two situations as follows:
// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC which user should be aware of. // arg_in and arg_out are assumed to be NHWC which user should be aware of.
// Thus here we change these args' layout to NHWC // Thus here we change these args' layout to NHWC
ModifyLayout(graph.get()); #ifdef LITE_WITH_MLU
if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
ModifyLayout(graph.get());
}
if (lite::DeviceInfo::Global().UseFirstConv()) {
GatherAndModifyFirstConvNodes(graph.get());
}
#endif
// insert io_copy, layout and precision cast of subgraph's inputs and outputs // insert io_copy, layout and precision cast of subgraph's inputs and outputs
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
......
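A note on the reordering in InsertBefore/InsertAfter above: when a boundary tensor (feed->arg_in->subgraph or subgraph->arg_out->fetch) disagrees with the subgraph's declared type, the pass now plans the precision "cast" first (skipped when the arg feeds the first conv) and then a "layout" op for NCHW<->NHWC, replacing the old "transpose" op. A minimal standalone sketch of that planning order, using toy enums rather than the real LiteType:

#include <iostream>
#include <string>
#include <vector>

// Toy stand-ins; the real pass compares LiteType objects.
enum class Precision { kFloat, kFP16, kInt8 };
enum class Layout { kNCHW, kNHWC };

struct TensorType {
  Precision precision;
  Layout layout;
};

// Mirrors the order used by InsertBefore: precision cast first
// (unless this arg feeds the first conv), then a layout cast.
std::vector<std::string> PlanCastsBefore(const TensorType& head,
                                         const TensorType& inst,
                                         bool is_first_conv_head) {
  std::vector<std::string> ops;
  if (head.precision != inst.precision && !is_first_conv_head) {
    ops.push_back("cast");
  }
  if (head.layout != inst.layout) {
    ops.push_back("layout");
  }
  return ops;
}

int main() {
  TensorType feed_arg{Precision::kFloat, Layout::kNHWC};
  TensorType subgraph_in{Precision::kFP16, Layout::kNCHW};
  for (const auto& op : PlanCastsBefore(feed_arg, subgraph_in, false)) {
    std::cout << op << "\n";  // prints "cast" then "layout"
  }
}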
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/mir/pass.h" #include "lite/core/mir/pass.h"
...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { ...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
const Type* cast_type); const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph); void RecreateOp(Node* inst_node, SSAGraph* graph);
void GatherAndModifyFirstConvNodes(SSAGraph* graph);
bool IsFirstConvNode(Node* arg_node);
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
private:
std::set<std::string> first_conv_nodes_;
}; };
} // namespace mir } // namespace mir
......
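The new GatherAndModifyFirstConvNodes / IsFirstConvNode / IsFirstConvInSubgraph path walks the outputs of feed ops, records the args that are consumed directly by a conv2d inside the subgraph's sub-block, and retags them as kInt8 so the precision cast before the first conv can be skipped. A simplified, self-contained sketch of the detection step — OpDesc here is a toy stand-in for the framework's cpp::OpDesc:

#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy stand-in for the op descriptors inside the subgraph's sub-block.
struct OpDesc {
  std::string type;
  std::vector<std::string> inputs;
};

// An arg is a "first conv" input if some conv2d in the sub-block reads it directly.
bool IsFirstConvInput(const std::vector<OpDesc>& sub_block, const std::string& arg) {
  for (const auto& op : sub_block) {
    if (op.type != "conv2d") continue;
    if (std::find(op.inputs.begin(), op.inputs.end(), arg) != op.inputs.end()) {
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<OpDesc> sub_block = {{"conv2d", {"image", "conv_w"}},
                                   {"relu", {"conv_out"}}};
  std::set<std::string> first_conv_nodes;
  if (IsFirstConvInput(sub_block, "image")) {
    // The real pass would also retag this arg's precision to kInt8 here.
    first_conv_nodes.insert("image");
  }
  std::cout << first_conv_nodes.size() << "\n";  // prints 1
}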
(The remaining file diffs in this commit are collapsed and not shown.)