"paddle/fluid/lite/operators/elementwise_ops.h" does not exist on "af79b192077a6485fc90be4112f80acce8fb748b"
Unverified commit d64d4b6f, authored by myq406450149 and committed by GitHub

Merge branch 'develop' into pass

@@ -59,7 +59,9 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
 lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
 lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
+lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
 lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
+lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
 lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
 lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
 lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
@@ -177,6 +179,10 @@ if(LITE_WITH_XPU)
   include(device/xpu)
 endif()
+if(LITE_WITH_MLU)
+  include(mlu)
+endif()
 include(external/mklml)   # download mklml package
 include(external/xbyak)   # download xbyak package
 include(external/libxsmm) # download, build, install libxsmm
......
@@ -136,6 +136,9 @@ endif()
 if (LITE_WITH_XPU)
   add_definitions("-DLITE_WITH_XPU")
+  if (LITE_WITH_XTCL)
+    add_definitions("-DLITE_WITH_XTCL")
+  endif()
 endif()
 if (LITE_WITH_OPENCL)
@@ -150,6 +153,10 @@ if (LITE_WITH_BM)
   add_definitions("-DLITE_WITH_BM")
 endif()
+if (LITE_WITH_MLU)
+  add_definitions("-DLITE_WITH_MLU")
+endif()
 if (LITE_WITH_PROFILE)
   add_definitions("-DLITE_WITH_PROFILE")
 endif()
......
@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
     message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
   endif()
 endif()
 message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
-find_path(XPU_SDK_INC NAMES xtcl.h
-  PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_INC)
-  message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
-endif()
-include_directories("${XPU_SDK_ROOT}/XTCL/include")
 include_directories("${XPU_SDK_ROOT}/XTDK/include")
-find_library(XPU_SDK_XTCL_FILE NAMES xtcl
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_XTCL_FILE)
-  message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
-  add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
-endif()
-find_library(XPU_SDK_TVM_FILE NAMES tvm
-  PATHS ${XPU_SDK_ROOT}/XTCL/so
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_TVM_FILE)
-  message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
-  add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
-endif()
 find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
   PATHS ${XPU_SDK_ROOT}/XTDK/shlib
   NO_DEFAULT_PATH)
@@ -82,23 +50,55 @@ else()
   set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
 endif()
-find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
-  PATHS ${XPU_SDK_ROOT}/XTDK/shlib
-  NO_DEFAULT_PATH)
-if(NOT XPU_SDK_LLVM_FILE)
-  message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
-else()
-  message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
-  add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
-  set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
-endif()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
-set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
-set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
+set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
+
+if(LITE_WITH_XTCL)
+  find_path(XPU_SDK_INC NAMES xtcl.h
+    PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_INC)
+    message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
+  endif()
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+
+  find_library(XPU_SDK_XTCL_FILE NAMES xtcl
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_XTCL_FILE)
+    message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
+    add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
+  endif()
+
+  find_library(XPU_SDK_TVM_FILE NAMES tvm
+    PATHS ${XPU_SDK_ROOT}/XTCL/so
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_TVM_FILE)
+    message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
+    add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
+  endif()
+
+  find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
+    PATHS ${XPU_SDK_ROOT}/XTDK/shlib
+    NO_DEFAULT_PATH)
+  if(NOT XPU_SDK_LLVM_FILE)
+    message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
+  else()
+    message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
+    add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
+    set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
+  endif()
+
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
+  set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
+  set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
+endif()
@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()
+
+  if (LITE_WITH_MLU)
+    foreach(var ${lite_deps_MLU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
   set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()
@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )
   if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${CV_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
   target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
   endif()
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
     ARGS
     COMPILE_LEVEL # (basic|extra)
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
       CV_DEPS ${args_CV_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )
   _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
   # strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")
@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
   file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
+# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
     endif()
     set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
+  if ("${device}" STREQUAL "MLU")
+    if (NOT LITE_WITH_MLU)
+      return()
+    endif()
+    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
+  endif()
   if ("${device}" STREQUAL "OPENCL")
     if (NOT LITE_WITH_OPENCL)
       foreach(src ${args_SRCS})
@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
       NPU_DEPS ${args_NPU_DEPS}
       XPU_DEPS ${args_XPU_DEPS}
       BM_DEPS ${args_BM_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
@@ -427,7 +444,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -462,6 +479,7 @@ function(add_operator TARGET level)
       NPU_DEPS ${args_NPU_DEPS}
       XPU_DEPS ${args_XPU_DEPS}
       BM_DEPS ${args_BM_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
......
@@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
 message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
+message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......
@@ -10,6 +10,7 @@ if (LITE_ON_TINY_PUBLISH)
 endif()
 set(light_lib_DEPS light_api paddle_api paddle_api_light)
 if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
     #full api dynamic library
     lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
@@ -19,7 +20,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
     if(LITE_WITH_X86)
         add_dependencies(paddle_full_api_shared xxhash)
         target_link_libraries(paddle_full_api_shared xxhash)
         if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
             add_dependencies(paddle_full_api_shared dynload_mklml)
         endif()
     endif()
@@ -66,7 +67,8 @@ if (WITH_TESTING)
       CUDA_DEPS ${cuda_kernels}
       X86_DEPS ${x86_kernels}
       XPU_DEPS ${xpu_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels})
 endif()
 if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -88,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
 message(STATUS "get XPU kernels ${xpu_kernels}")
 message(STATUS "get FPGA kernels ${fpga_kernels}")
 message(STATUS "get BM kernels ${bm_kernels}")
+message(STATUS "get MLU kernels ${mlu_kernels}")
 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
@@ -125,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    BM_DEPS ${bm_kernels})
+    BM_DEPS ${bm_kernels}
+    MLU_DEPS ${mlu_kernels})
 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -144,6 +148,7 @@ if(WITH_TESTING)
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels}
       EXCLUDE_COMPILE_DEPS "ON"
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -264,8 +269,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
     NPU_DEPS ${npu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    CV_DEPS paddle_cv_arm
-    NPU_DEPS ${npu_kernels}
     BM_DEPS ${bm_kernels})
 # The final inference library for just MobileConfig.
 bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
@@ -292,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
       XPU_DEPS ${xpu_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -329,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
     X86_DEPS ${x86_kernels}
     FPGA_DEPS ${fpga_kernels}
     BM_DEPS ${bm_kernels}
+    MLU_DEPS ${mlu_kernels}
     ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
 if (WITH_TESTING)
     add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -342,6 +347,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
      CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
@@ -354,6 +360,7 @@ if(NOT IOS)
      CV_DEPS paddle_cv_arm
      NPU_DEPS ${npu_kernels}
      XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
      CL_DEPS ${opencl_kernels}
      BM_DEPS ${bm_kernels}
      FPGA_DEPS ${fpga_kernels}
@@ -366,6 +373,7 @@ if(NOT IOS)
      CV_DEPS paddle_cv_arm
      NPU_DEPS ${npu_kernels}
      XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
      CL_DEPS ${opencl_kernels}
      BM_DEPS ${bm_kernels}
      FPGA_DEPS ${fpga_kernels}
@@ -378,6 +386,7 @@ if(NOT IOS)
      CV_DEPS paddle_cv_arm
      NPU_DEPS ${npu_kernels}
      XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
      CL_DEPS ${opencl_kernels}
      FPGA_DEPS ${fpga_kernels}
      X86_DEPS ${x86_kernels}
@@ -389,6 +398,7 @@ if(NOT IOS)
      CV_DEPS paddle_cv_arm
      NPU_DEPS ${npu_kernels}
      XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
      CL_DEPS ${opencl_kernels}
      BM_DEPS ${bm_kernels}
      FPGA_DEPS ${fpga_kernels}
......
@@ -43,6 +43,16 @@ class LITE_API Predictor {
  public:
   // Create an empty predictor.
   Predictor() { scope_ = std::make_shared<Scope>(); }
+  ~Predictor() {
+#ifdef LITE_WITH_OPENCL
+    CLRuntime::Global()->ReleaseResources();
+#endif
+    scope_.reset();
+    exec_scope_ = nullptr;
+    program_.reset();
+    input_names_.clear();
+    output_names_.clear();
+  }
   // Create a predictor with the weight variable scope set.
   explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
       : scope_(root_scope) {}
......
@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
     }
   }
 #endif
+#ifdef LITE_WITH_MLU
+  Env<TARGET(kMLU)>::Init();
+  lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
+                                           config.mlu_core_number(),
+                                           config.mlu_use_first_conv(),
+                                           config.mlu_first_conv_mean(),
+                                           config.mlu_first_conv_std(),
+                                           config.mlu_input_layout());
+#endif  // LITE_WITH_MLU
   std::vector<std::string> passes{};
   auto use_layout_preprocess_pass =
       config.model_dir().find("OPENCL_PRE_PRECESS");
......
@@ -107,6 +107,8 @@ class LightPredictorImpl : public lite_api::PaddlePredictor {
  public:
   LightPredictorImpl() = default;
+
+  ~LightPredictorImpl();
   std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
   std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
......
@@ -21,6 +21,13 @@
 namespace paddle {
 namespace lite {
+LightPredictorImpl::~LightPredictorImpl() {
+  raw_predictor_.reset();
+#ifdef LITE_WITH_OPENCL
+  CLRuntime::Global()->ReleaseResources();
+#endif
+}
+
 void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
   // LightPredictor Only support NaiveBuffer backend in publish lib
   if (config.lite_model_file().empty()) {
......
@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
       valid_places.emplace_back(TARGET(kNPU));
     } else if (target_repr == "xpu") {
       valid_places.emplace_back(TARGET(kXPU));
+    } else if (target_repr == "mlu") {
+      valid_places.emplace_back(TARGET(kMLU));
     } else {
       LOG(FATAL) << lite::string_format(
           "Wrong target '%s' found, please check the command flag "
......
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "lite/api/paddle_api.h"
+#include "lite/core/context.h"
 #include "lite/core/device_info.h"
 #include "lite/core/target_wrapper.h"
 #include "lite/core/tensor.h"
@@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) {
 #endif
 }
+#ifdef LITE_WITH_MLU
+void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+  mlu_core_version_ = core_version;
+}
+void CxxConfig::set_mlu_core_number(int core_number) {
+  mlu_core_number_ = core_number;
+}
+void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
+  mlu_input_layout_ = layout;
+}
+void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
+  mlu_use_first_conv_ = use_first_conv;
+}
+void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
+  mlu_first_conv_mean_ = mean;
+}
+void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
+  mlu_first_conv_std_ = std;
+}
+lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
+  return mlu_core_version_;
+}
+int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
+DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
+bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
+const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
+  return mlu_first_conv_mean_;
+}
+const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
+  return mlu_first_conv_std_;
+}
+#endif
+
+void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
+#ifdef LITE_WITH_XPU
+  lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_workspace_l3_size_per_thread' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
+void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
+#ifdef LITE_WITH_XPU
+  lite::Context<TargetType::kXPU>::SetDev(dev_no);
+#else
+  LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
+                  "ignored, please rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
 // set model data in combined format, `set_model_from_file` refers to loading
 // model from file, set_model_from_buffer refers to loading model from memory
 // buffer
......
@@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
   int x86_math_library_math_threads_ = 1;
 #endif
+#ifdef LITE_WITH_MLU
+  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
+  int mlu_core_number_{1};
+  DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
+  bool mlu_use_first_conv_{false};
+  std::vector<float> mlu_first_conv_mean_;
+  std::vector<float> mlu_first_conv_std_;
+#endif
  public:
   void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase {
     return x86_math_library_math_threads_;
   }
 #endif
+#ifdef LITE_WITH_MLU
+  // set MLU core version, which is used when compiling MLU kernels
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
+  // set MLU core number, which is used when compiling MLU kernels
+  void set_mlu_core_number(int core_number);
+  // set MLU input layout. User can specify the layout of input data to be
+  // NHWC; default is NCHW
+  void set_mlu_input_layout(DataLayoutType layout);
+  // whether to use MLU's first conv kernel. First conv is a special kernel
+  // provided by MLU: its input is uint8, and it also needs two 3-dimensional
+  // vectors which save all inputs' mean and std values
+  void set_mlu_use_first_conv(bool use_first_conv);
+  // set the 3-dimensional mean vector used by MLU's first conv
+  void set_mlu_first_conv_mean(const std::vector<float>& mean);
+  // set the 3-dimensional std vector used by MLU's first conv
+  void set_mlu_first_conv_std(const std::vector<float>& std);
+
+  lite_api::MLUCoreVersion mlu_core_version() const;
+  int mlu_core_number() const;
+  DataLayoutType mlu_input_layout() const;
+  bool mlu_use_first_conv() const;
+  const std::vector<float>& mlu_first_conv_mean() const;
+  const std::vector<float>& mlu_first_conv_std() const;
+#endif
+
+  // XPU only, set the size of the workspace memory from L3 cache for the
+  // current thread.
+  void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
+  // XPU only, specify the target device ID for the current thread.
+  void set_xpu_dev_per_thread(int dev_no = 0);
 };
 /// MobileConfig is the config for the light weight predictor, it will skip
......
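Taken together, the new MLU knobs above are exercised like any other CxxConfig option. A minimal sketch (not part of this commit; the model path, mean/std values, and the CreatePaddlePredictor call are illustrative assumptions for a build with LITE_WITH_MLU=ON):

// sketch.cc -- drives the CxxConfig MLU setters declared above
#include "lite/api/paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places({Place{TARGET(kMLU), PRECISION(kFloat)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
  config.set_mlu_core_version(MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(1);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First conv takes uint8 input plus per-channel mean/std (example numbers).
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.f, 117.f, 104.f});
  config.set_mlu_first_conv_std({59.f, 57.f, 57.f});
  auto predictor = CreatePaddlePredictor(config);
  return 0;
}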
@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
                                              "fpga",
                                              "npu",
                                              "xpu",
-                                             "bm"};
+                                             "bm",
+                                             "mlu"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
                                              "kFPGA",
                                              "kNPU",
                                              "kXPU",
-                                             "kBM"};
+                                             "kBM",
+                                             "kMLU"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                          TARGET(kNPU),
                                          TARGET(kXPU),
                                          TARGET(kBM),
                                          TARGET(kMLU),
                                          TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
......
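Note on the two string tables above: both are indexed by the enum's integer value, so entry order must track the numeric values (kBM = 10 before kMLU = 11, per the enum in the next file). A minimal sketch of that invariant (hypothetical code, not Paddle-Lite source):

// Why table order must follow enum values: the name lives at index
// static_cast<int>(target), so a misordered table returns the wrong name.
#include <cassert>
#include <string>
#include <vector>

enum class TargetType : int { kBM = 10, kMLU = 11 };

int main() {
  std::vector<std::string> repr(12);
  repr[10] = "kBM";   // must sit at the enum's numeric slot
  repr[11] = "kMLU";
  assert(repr[static_cast<int>(TargetType::kBM)] == "kBM");
  assert(repr[static_cast<int>(TargetType::kMLU)] == "kMLU");
  return 0;
}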
@@ -53,8 +53,8 @@ enum class TargetType : int {
   kNPU = 8,
   kXPU = 9,
   kBM = 10,
-  kAny = 6,  // any target
   kMLU = 11,
+  kAny = 6,   // any target
   NUM = 12,   // number of fields.
 };
 enum class PrecisionType : int {
@@ -89,6 +89,8 @@ typedef enum {
   LITE_POWER_RAND_LOW = 5
 } PowerMode;
+typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
+
 enum class ActivationType : int {
   kIndentity = 0,
   kRelu = 1,
@@ -100,7 +102,9 @@ enum class ActivationType : int {
   kSwish = 7,
   kExp = 8,
   kAbs = 9,
-  NUM = 10,
+  kHardSwish = 10,
+  kReciprocal = 11,
+  NUM = 12,
 };
 static size_t PrecisionTypeLength(PrecisionType type) {
......
@@ -45,6 +45,10 @@ USE_MIR_PASS(memory_optimize_pass);
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
+USE_MIR_PASS(mlu_subgraph_pass);
+USE_MIR_PASS(mlu_postprocess_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
 USE_MIR_PASS(assign_value_eliminate_pass);
+USE_MIR_PASS(__xpu__resnet_fuse_pass);
+USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
@@ -47,6 +47,7 @@ using lite_api::TargetType;
 using lite_api::PrecisionType;
 using lite_api::DataLayoutType;
 using lite_api::Place;
+using lite_api::MLUCoreVersion;
 using lite::LightPredictorImpl;
 using lite_api::OptBase;
@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
 static void BindLitePowerMode(py::module *m);
 static void BindLitePlace(py::module *m);
 static void BindLiteTensor(py::module *m);
+static void BindLiteMLUCoreVersion(py::module *m);
 void BindLiteApi(py::module *m) {
   BindLiteCxxConfig(m);
@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
   BindLitePowerMode(m);
   BindLitePlace(m);
   BindLiteTensor(m);
+  BindLiteMLUCoreVersion(m);
 #ifndef LITE_ON_TINY_PUBLISH
   BindLiteCxxPredictor(m);
 #endif
@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
       .def("set_power_mode", &CxxConfig::set_power_mode)
       .def("power_mode", &CxxConfig::power_mode);
 #endif
+#ifdef LITE_WITH_MLU
+  cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
+      .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
+      .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
+      .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
+      .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
+      .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
+#endif
 }
 // TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
       .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
 }
+
+void BindLiteMLUCoreVersion(py::module *m) {
+  py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
+      .value("LITE_MLU_220", MLUCoreVersion::MLU_220)
+      .value("LITE_MLU_270", MLUCoreVersion::MLU_270);
+}
 void BindLitePlace(py::module *m) {
   // TargetType
   py::enum_<TargetType>(*m, "TargetType")
@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
       .value("OpenCL", TargetType::kOpenCL)
       .value("FPGA", TargetType::kFPGA)
       .value("NPU", TargetType::kNPU)
+      .value("MLU", TargetType::kMLU)
       .value("Any", TargetType::kAny);
   // PrecisionType
@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
   DO_GETTER_ONCE(data_type__, name__##_data)
   DATA_GETTER_SETTER_ONCE(int8_t, int8);
+#ifdef LITE_WITH_MLU
+  tensor.def("set_uint8_data",
+             [](Tensor &self,
+                const std::vector<uint8_t> &data,
+                TargetType type = TargetType::kHost) {
+               if (type == TargetType::kHost) {
+                 self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
+               }
+             },
+             py::arg("data"),
+             py::arg("type") = TargetType::kHost);
+
+  DO_GETTER_ONCE(uint8_t, "uint8_data");
+#endif
   DATA_GETTER_SETTER_ONCE(int32_t, int32);
   DATA_GETTER_SETTER_ONCE(float, float);
 #undef DO_GETTER_ONCE
......
@@ -6,4 +6,5 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "lite/backends/arm/math/activation.h"
+#include <algorithm>
 #include <string>
 #include "lite/backends/arm/math/funcs.h"
@@ -711,6 +712,38 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
   }
 }
+template <>
+void act_hard_swish<float>(const float* din,
+                           float* dout,
+                           int size,
+                           float threshold,
+                           float scale,
+                           float offset,
+                           int threads) {
+  const float* ptr_in = din;
+  float* ptr_out = dout;
+  for (int i = 0; i < size; ++i) {
+    ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
+                 ptr_in[0] / scale;
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
+template <>
+void act_reciprocal<float>(const float* din,
+                           float* dout,
+                           int size,
+                           int threads) {
+  const float* ptr_in = din;
+  float* ptr_out = dout;
+  for (int i = 0; i < size; ++i) {
+    ptr_out[0] = 1.0 / ptr_in[0];
+    ptr_in++;
+    ptr_out++;
+  }
+}
+
 #ifdef LITE_WITH_TRAIN
 template <>
 void act_square_grad(const float* din,
......
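For a quick sanity check of the scalar fallback just added: it computes out = min(max(0, x + offset), threshold) * x / scale. A self-contained sketch of the same math (not Paddle-Lite source; the parameter values are just common illustrative defaults):

// hard_swish_sketch.cc
#include <algorithm>
#include <cstdio>

int main() {
  const float threshold = 6.f, scale = 6.f, offset = 3.f;  // illustrative
  const float in[4] = {-4.f, -1.f, 1.f, 4.f};
  for (float x : in) {
    float y = std::min(std::max(0.f, x + offset), threshold) * x / scale;
    // Expected: -4 -> 0, -1 -> -1/3, 1 -> 2/3, 4 -> 4
    std::printf("hard_swish(%g) = %g\n", x, y);
  }
  return 0;
}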
@@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
 template <typename T>
 void act_square(const T* din, T* dout, int size, int threads);
+template <typename T>
+void act_hard_swish(const T* din,
+                    T* dout,
+                    int size,
+                    float threshold,
+                    float scale,
+                    float offset,
+                    int threads);
+
+template <typename T>
+void act_reciprocal(const T* din, T* dout, int size, int threads);
 #ifdef LITE_WITH_TRAIN
 template <typename T>
 void act_square_grad(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void decode_center_size(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh % out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
CL_DTYPE4 prior_box_input[4];
CL_DTYPE4 prior_box_var_input[4];
CL_DTYPE4 target_box_input[4];
prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 0, prior_box_pos.y));
prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 1, prior_box_pos.y));
prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 2, prior_box_pos.y));
prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 3, prior_box_pos.y));
prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 1, target_box_pos.y));
target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 2, target_box_pos.y));
target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 3, target_box_pos.y));
CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
CL_DTYPE4 target_box_center_x;
CL_DTYPE4 target_box_center_y;
CL_DTYPE4 target_box_width;
CL_DTYPE4 target_box_height;
CL_DTYPE4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
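For readers of the new kernel above: per component it implements the standard box-coder decode. With prior width/height $p_w, p_h$, prior center $(p_{cx}, p_{cy})$, variances $v_0..v_3$, and encoded targets $t_0..t_3$, the code computes

$$x_c = v_0 t_0 p_w + p_{cx}, \qquad y_c = v_1 t_1 p_h + p_{cy}, \qquad w = p_w e^{v_2 t_2}, \qquad h = p_h e^{v_3 t_3}$$
$$\text{output} = \left(x_c - \tfrac{w}{2},\; y_c - \tfrac{h}{2},\; x_c + \tfrac{w}{2},\; y_c + \tfrac{h}{2}\right)$$

evaluated for up to four boxes packed into the x/y/z/w lanes of the output image, with the `out_C - out_c * 4` guards handling a channel count that is not a multiple of four.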
@@ -29,30 +29,38 @@ CLRuntime* CLRuntime::Global() {
 }
 CLRuntime::~CLRuntime() {
+  LOG(INFO) << "CLRuntime::~CLRuntime()";
+  // Note: do ReleaseResources() in predictor
+  command_queue_&& clReleaseCommandQueue(command_queue_->get());
+  command_queue_.reset();
+  context_&& clReleaseContext(context_->get());
+  context_.reset();
+  device_.reset();
+  platform_.reset();
+  initialized_ = false;
+}
+
+void CLRuntime::ReleaseResources() {
+  // if (is_resources_released_) {
+  //   return;
+  // }
   if (command_queue_ != nullptr) {
     command_queue_->flush();
     command_queue_->finish();
   }
   for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
     clReleaseKernel(kernels_[kidx]->get());
     kernels_[kidx].reset();
   }
   kernels_.clear();
   kernel_offset_.clear();
   for (auto& p : programs_) {
     clReleaseProgram(p.second->get());
   }
   programs_.clear();
-  // For controlling the destruction order
-  command_queue_&& clReleaseCommandQueue(command_queue_->get());
-  command_queue_.reset();
-  context_&& clReleaseContext(context_->get());
-  context_.reset();
-  device_.reset();
-  platform_.reset();
+  LOG(INFO) << "release resources finished.";
+  is_resources_released_ = true;
 }
 bool CLRuntime::Init() {
......
@@ -33,6 +33,8 @@ class CLRuntime {
  public:
   static CLRuntime* Global();
+  void ReleaseResources();
+
   bool Init();
   cl::Platform& platform();
@@ -116,6 +118,8 @@ class CLRuntime {
   bool initialized_{false};
   bool is_init_success_{false};
+
+  bool is_resources_released_{false};
 };
 }  // namespace lite
......
@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
     //         : nullptr;
     // fill in data
-    std::vector<size_t> low_level;
-    size_t low_offset = 0;
+    std::vector<uint64_t> low_level;
+    uint64_t low_offset = 0;
     for (auto &items : selected_items) {
       low_level.push_back(low_offset);
       for (auto &item : items) {
......
@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
                        paddle::framework::LoDTensor* pre_scores) {
   // lod
   paddle::framework::LoD lod;
-  std::vector<size_t> level0({0, 2, 4});
-  std::vector<size_t> level1({0, 1, 2, 3, 4});
+  std::vector<uint64_t> level0({0, 2, 4});
+  std::vector<uint64_t> level1({0, 1, 2, 3, 4});
   lod.push_back(level0);
   lod.push_back(level1);
   ids->set_lod(lod);
......
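The size_t to uint64_t switch in these two files fixes a type mismatch rather than changing behavior: LoD levels are stored as vectors of uint64_t, and size_t is only 32 bits wide on 32-bit targets, so a std::vector<size_t> is a different type there. A minimal sketch (assuming the level container is std::vector<uint64_t>, which is what the change implies):

// lod_sketch.cc -- why the levels are built as std::vector<uint64_t>
#include <cstdint>
#include <vector>

using LoD = std::vector<std::vector<uint64_t>>;  // assumed level container

int main() {
  LoD lod;
  std::vector<uint64_t> level0({0, 2, 4});       // matches the element type
  std::vector<uint64_t> level1({0, 1, 2, 3, 4});  // on every platform
  lod.push_back(level0);
  lod.push_back(level1);
  return 0;
}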
@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
                 mat_a.data<T>(),
                 mat_b.data<T>(),
                 beta,
-                mat_out->mutable_data<T>());
+                mat_out->template mutable_data<T>());
 }
 template <>
@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
                   mat_a.data<T>(),
                   mat_b.data<T>(),
                   beta,
-                  mat_out->mutable_data<T>());
+                  mat_out->template mutable_data<T>());
   } else {
     PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
                    dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
         mat_a.data<T>(),
         mat_b.data<T>(),
         beta,
-        mat_out->mutable_data<T>(),
+        mat_out->template mutable_data<T>(),
         dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
         dim_a.stride_,
         dim_b.stride_);
......
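The `->template mutable_data<T>()` edits here and in the following files all address the same C++ rule: inside a template, a member template invoked through a dependent object must be prefixed with the `template` disambiguator, or the `<` is parsed as a less-than operator. A minimal sketch (hypothetical types, not Paddle-Lite source):

// template_disambiguator.cc
template <typename T>
struct Tensor {
  template <typename U>
  U* mutable_data() { return nullptr; }
};

template <typename T>
void Fill(Tensor<T>* t) {
  // Without `template`, `t->mutable_data<T>()` parses as
  // (t->mutable_data) < T > (), which fails to compile.
  T* p = t->template mutable_data<T>();
  (void)p;
}

int main() {
  Tensor<float> t;
  Fill(&t);
  return 0;
}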
@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
     // auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
     // computation
-    auto output_data = output->mutable_data<T>();
+    auto output_data = output->template mutable_data<T>();
     int col_idx = 0;
     for (int j = 0; j < num; ++j) {
       int col_len = input_cols[j];
@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
       int col_len = output_cols[j];
       auto* out_tensor = outputs->at(j);
       if (out_tensor != nullptr) {
-        T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len;
+        T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
         std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
         // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
         //              sizeof(T) * col_len);
......
@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
                    .reshape(batch_axis_remain)
                    .sum(Eigen::DSizes<int, 1>(1)));
     } else {
-      const T* prob_data = prob->data<T>();
-      T* loss_data = out->mutable_data<T>();
+      const T* prob_data = prob->template data<T>();
+      T* loss_data = out->template mutable_data<T>();
       const int64_t* label_data = labels->data<int64_t>();
       for (int i = 0; i < batch_size; ++i) {
......
@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
     int channels_col = im_channels * filter_height * filter_width;
-    T* im_data = im->mutable_data<T>();
+    T* im_data = im->template mutable_data<T>();
     const T* col_data = col.data<T>();
     for (int c = 0; c < channels_col; ++c) {
@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
     int col_width = col->dims()[1];
     const T* im_data = im.data<T>();
-    T* col_data = col->mutable_data<T>();
+    T* col_data = col->template mutable_data<T>();
     for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
       for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
                    "col_width and padding(padding_left, padding_right) are "
                    "inconsistent.");
-    T* im_data = im->mutable_data<T>();
+    T* im_data = im->template mutable_data<T>();
     const T* col_data = col.data<T>();
     for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
......
...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, ...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, ...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
int output_width = col->dims()[4]; int output_width = col->dims()[4];
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width; int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width; size_t copy_size = sizeof(T) * output_width;
...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, ...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
constexpr int prw = 1; constexpr int prw = 1;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int im_size = im_height * im_width; int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
......
...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU { ...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {} : tensor_(tensor), value_(value) {}
template <typename T> template <typename T>
void apply() const { void apply() const {
auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86); auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_)); std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
} }
lite::Tensor* tensor_; lite::Tensor* tensor_;
...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>(); const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * in_dims[0] + j] =
......
...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> { ...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> { ...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......
...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> { ...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
// c_size means the output size of each sample // c_size means the output size of each sample
int c_size = fea_size * output_channels; int c_size = fea_size * output_channels;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i; int new_bindex = c_size * i;
...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> { ...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i; int blen = fea_size * output_channels * i;
......
...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int input_stride = input_height * input_width; const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> { ...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> { ...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -58,11 +58,11 @@ class SampleWithProb { ...@@ -58,11 +58,11 @@ class SampleWithProb {
const int64_t* label_data = L->data<int64_t>(); const int64_t* label_data = L->data<int64_t>();
// int64_t* samples_data = // int64_t* samples_data =
// S->mutable_data<int64_t>(ret_dim, Target); // S->mutable_data<int64_t>(ret_dim, Target);
// T* probabilities_data = P->mutable_data<T>(ret_dim, Target); // T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
S->Resize({batch_size, num_sampled_classes}); S->Resize({batch_size, num_sampled_classes});
auto* samples_data = S->mutable_data<int64_t>(Target); auto* samples_data = S->mutable_data<int64_t>(Target);
P->Resize({batch_size, num_sampled_classes}); P->Resize({batch_size, num_sampled_classes});
auto* probabilities_data = P->mutable_data<T>(Target); auto* probabilities_data = P->template mutable_data<T>(Target);
// temp sets for unique sampling // temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples; std::unordered_set<int64_t> tmp_samples;
......
...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> { ...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size})); lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>(); const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86); auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>(); const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas, call_gemm<lite::X86Context, T>(blas,
......
...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> { ...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->mutable_data<T>(); auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
std::copy_n(in1_data, in1_value.numel(), out_data); std::copy_n(in1_data, in1_value.numel(), out_data);
...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> { ...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
functor(context, output, 0.0); functor(context, output, 0.0);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> { ...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->mutable_data<T>(); auto* in2_data = in2_value->template mutable_data<T>();
std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
} }
}; };
...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> { ...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
input2->set_rows(in2_rows); input2->set_rows(in2_rows);
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
T* in2_data = in2_value->mutable_data<T>(); T* in2_data = in2_value->template mutable_data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
size_t offset = 0u; size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) { for (size_t i = 0u; i != input1.size(); ++i) {
...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> { ...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->mutable_data<T>(); auto* input2_data = input2->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>( lite::DDim dims(std::vector<int64_t>(
{static_cast<int64_t>(merged_row_set.size()), input_width})); {static_cast<int64_t>(merged_row_set.size()), input_width}));
out.mutable_value()->Resize(dims); out.mutable_value()->Resize(dims);
auto* out_data = out.mutable_value()->mutable_data<T>(); auto* out_data = out.mutable_value()->template mutable_data<T>();
if (merged_row_set.size() == row_num && !sorted_result) { if (merged_row_set.size() == row_num && !sorted_result) {
// no duplicated ids, just concat the result together // no duplicated ids, just concat the result together
...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> { ...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->template data<T>();
// FIXME(typhoonzero): use macro fix the below messy code. // FIXME(typhoonzero): use macro fix the below messy code.
switch (op) { switch (op) {
......
...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public: public:
void operator()(const lite::Context<lite::TargetType::kX86>& context, void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index) { bool is_src_index) {
const size_t* index = index_lod.data(); const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims(); const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims(); const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
auto height = dst_dims[0]; auto height = dst_dims[0];
auto width = dst_dims[1]; auto width = dst_dims[1];
auto* src_data = src.data<T>(); auto* src_data = src.data<T>();
auto* dst_data = dst->mutable_data<T>(); auto* dst_data = dst->template mutable_data<T>();
const int sz = width * sizeof(T); const int sz = width * sizeof(T);
if (is_src_index) { if (is_src_index) {
for (int i = 0; i < height; ++i) { for (int i = 0; i < height; ++i) {
......
...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { ...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context, void operator()(const lite::Context<Target>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { ...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor {
// batch_lods[2] is the sort order for the input LoDTensor. // batch_lods[2] is the sort order for the input LoDTensor.
batch_lods->at(2).resize(seq_info.size()); batch_lods->at(2).resize(seq_info.size());
size_t* batch_starts = batch_lods->at(0).data(); auto* batch_starts = batch_lods->at(0).data();
size_t* seq2batch_idx = batch_lods->at(1).data(); auto* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) { for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]); auto batch_id = static_cast<int>(batch_starts[n]);
...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { ...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor {
} }
batch_starts[n + 1] = static_cast<size_t>(batch_id); batch_starts[n + 1] = static_cast<size_t>(batch_id);
} }
size_t* seq_order = batch_lods->at(2).data(); auto* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) { for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx; seq_order[i] = seq_info[i].seq_idx;
} }
......
...@@ -22,15 +22,15 @@ namespace math { ...@@ -22,15 +22,15 @@ namespace math {
template <typename T> template <typename T>
void CopyValidData(lite::Tensor* dst_tensor, void CopyValidData(lite::Tensor* dst_tensor,
const lite::Tensor* src_tensor, const lite::Tensor* src_tensor,
const std::vector<size_t>& seq_offsets, const std::vector<uint64_t>& seq_offsets,
int pad_seq_len, int pad_seq_len,
int step_width, int step_width,
bool norm_by_len, bool norm_by_len,
CopyType type, CopyType type,
PadLayout layout) { PadLayout layout) {
int seq_num = seq_offsets.size() - 1; int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>(); const T* src_data = src_tensor->template data<T>();
T* dst_data = dst_tensor->mutable_data<T>(); T* dst_data = dst_tensor->template mutable_data<T>();
int seq_cpy_gap = step_width; int seq_cpy_gap = step_width;
int pad_cpy_gap = int pad_cpy_gap =
...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
"'step_width'."); "'step_width'.");
// fill padding value // fill padding value
T* pad_data = pad_tensor->mutable_data<T>(); T* pad_data = pad_tensor->template mutable_data<T>();
const T* pad_value_data = pad_value.data<T>(); const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) { if (pad_value.numel() == 1) {
fast_mem_init<T>( fast_mem_init<T>(
......
...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; ...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq }; enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength( inline static uint64_t MaximumSequenceLength(
const std::vector<size_t>& seq_offset) { const std::vector<uint64_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1; uint64_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0; uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) { for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
} }
...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( ...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength(
inline static void CheckDims(const lite::DDim& seq_tensor_dims, inline static void CheckDims(const lite::DDim& seq_tensor_dims,
const lite::DDim& pad_tensor_dims, const lite::DDim& pad_tensor_dims,
const std::vector<size_t>& seq_offset, const std::vector<uint64_t>& seq_offset,
int64_t padded_seq_len, int64_t padded_seq_len,
int64_t step_width, int64_t step_width,
const PadLayout& layout) { const PadLayout& layout) {
......
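The `std::vector<size_t>` to `std::vector<uint64_t>` switch running through these sequence hunks pins the LoD offset width so it no longer varies with the platform's `size_t` (which is 32-bit on 32-bit ARM builds). A standalone spot check of the patched `MaximumSequenceLength` logic:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Same logic as the patched MaximumSequenceLength, inlined for the example.
static uint64_t MaxSeqLen(const std::vector<uint64_t>& seq_offset) {
  uint64_t max_seq_len = 0;
  for (size_t i = 0; i + 1 < seq_offset.size(); ++i) {
    max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
  }
  return max_seq_len;
}

int main() {
  // LoD {0, 2, 7, 10} describes three sequences of lengths 2, 5 and 3.
  std::vector<uint64_t> lod{0, 2, 7, 10};
  assert(MaxSeqLen(lod) == 5);
  return 0;
}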
...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { ...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int* max_index = index->mutable_data<int>(); int* max_index = index->mutable_data<int>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq; int64_t dim = output->numel() / num_seq;
...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { ...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor {
const T* og_data = out_grad.data<T>(); const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>(); const int* max_index = index.data<int>();
T* ig_data = in_grad->mutable_data<T>(); T* ig_data = in_grad->template mutable_data<T>();
SetConstant<TARGET(kX86), T> set_zero; SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0)); set_zero(context, in_grad, static_cast<T>(0.0));
...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor { ...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { ...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { ...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor {
int64_t in_w = in_grad->numel() / in_grad->dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w); PADDLE_ENFORCE(in_w == out_w);
const T* out_g_data = out_grad.data<T>(); const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->mutable_data<T>(TARGET(kX86)); T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context); auto blas = math::GetBlas<TARGET(kX86), T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]); int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> { ...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") { if (pooltype == "SUM") {
const T* src = input.data<T>(); const T* src = input.data<T>();
T* dst = output->mutable_data<T>(TARGET(kX86)); T* dst = output->template mutable_data<T>(TARGET(kX86));
jit::seq_pool_attr_t attr( jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]), static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum); jit::SeqPoolType::kSum);
......
...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
TEST(SequencePoolingGrad, CPU_SUM) { TEST(SequencePoolingGrad, CPU_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod2); float>(lod2);
...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { ...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(SequencePoolingGrad, CUDA_SUM) { TEST(SequencePoolingGrad, CUDA_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod2); float>(lod2);
......
...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> {
size_t seq_width = seq->dims()[1]; size_t seq_width = seq->dims()[1];
lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(lite::TargetType::kX86); T* seq_data = seq->template mutable_data<T>(lite::TargetType::kX86);
for (size_t i = 0; i < num_seq; ++i) { for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width; for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width; j < lod[level][i + 1] * seq_width;
......
...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86); auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0; int offset = 0;
std::vector<size_t> vec_out_lod; std::vector<uint64_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1); vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) { for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i]; offset = row_lod[i];
...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
out->set_lod(lod_temp); out->set_lod(lod_temp);
auto in_data = in.data<T>(); auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86); auto out_data = out->template mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k]; T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
......
...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> { ...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* in_data = X->data<T>(); const T* in_data = X->template data<T>();
auto* out_data = Y->mutable_data<T>(); auto* out_data = Y->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes); T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1); max_val *= static_cast<T>(-1);
...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> { ...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* out_data = y->data<T>(); const T* out_data = y->template data<T>();
const T* out_grad = y_grad->data<T>(); const T* out_grad = y_grad->template data<T>();
T* in_grad = x_grad->mutable_data<T>(); T* in_grad = x_grad->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T scalar; T scalar;
vec_mul_reduce<T, lite::x86::avx>( vec_mul_reduce<T, lite::x86::avx>(
......
...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> { ...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
patch_size = processing_list.size(); patch_size = processing_list.size();
// T *patch_data = // T *patch_data =
// patch->mutable_data<T>({static_cast<int64_t>(patch_size), // patch->template mutable_data<T>({static_cast<int64_t>(patch_size),
// static_cast<int64_t>(patch_elem_size)}, // static_cast<int64_t>(patch_elem_size)},
// cpu_place); // cpu_place);
patch->Resize({static_cast<int64_t>(patch_size), patch->Resize({static_cast<int64_t>(patch_size),
static_cast<int64_t>(patch_elem_size)}); static_cast<int64_t>(patch_elem_size)});
auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86); auto *patch_data = patch->template mutable_data<T>(lite::TargetType::kX86);
constant(context, patch, 0); constant(context, patch, 0);
const T *features = node_features.data<T>(); const T *features = node_features.data<T>();
...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> { ...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> {
} }
} }
// T *grad_data = // T *grad_data =
// in_grad->mutable_data<T>({static_cast<int64_t>(node_count), // in_grad->template mutable_data<T>({static_cast<int64_t>(node_count),
// static_cast<int64_t>(grad_elem_size)}, // static_cast<int64_t>(grad_elem_size)},
// cpu_place); // cpu_place);
in_grad->Resize({static_cast<int64_t>(node_count), in_grad->Resize({static_cast<int64_t>(node_count),
static_cast<int64_t>(grad_elem_size)}); static_cast<int64_t>(grad_elem_size)});
auto *grad_data = in_grad->mutable_data<T>(lite::TargetType::kX86); auto *grad_data = in_grad->template mutable_data<T>(lite::TargetType::kX86);
constant(context, in_grad, 0); constant(context, in_grad, 0);
const T *out_g = out_grad.data<T>(); const T *out_g = out_grad.data<T>();
......
...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> { ...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) { for (int i = 0; i < input_feasize; ++i) {
...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> { ...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> { ...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
"mismatching."); "mismatching.");
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> { ...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
output_width, output_width,
"input_width and output_width are " "input_width and output_width are "
"mismatching."); "mismatching.");
T* vol_data = vol->mutable_data<T>(); T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
......
...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) ...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return() return()
endif() endif()
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
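`round_half_to_even` implements banker's rounding: exact ties go to the nearest even integer, everything else rounds normally. A few spot checks that follow directly from the definition above:

#include <cassert>

// Spot checks for round_half_to_even:
//   2.5  -> 2   (tie, rounds to the even neighbour)
//   3.5  -> 4   (tie, rounds to the even neighbour)
//  -2.5  -> -2  (tie, rounds to the even neighbour)
//   2.4  -> 2   (not a tie, ordinary rounding)
inline void CheckRoundHalfToEven() {
  assert(round_half_to_even(2.5f) == 2);
  assert(round_half_to_even(3.5f) == 4);
  assert(round_half_to_even(-2.5f) == -2);
  assert(round_half_to_even(2.4f) == 2);
}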
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
// early check to avoid INF or a big value getting into the converter function.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
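Together, `FindMaxAbs` and `ConvertFP32ToInt16` form the int16 quantization path: scan the buffer for its absolute maximum, then scale so that ±max maps to ±32767. A sketch of the flow (the `QuantizeWeights` helper is illustrative, not part of the patch):

#include <cstdint>
#include <vector>

// Quantize a float weight buffer to int16 using the two helpers above.
inline void QuantizeWeights(const std::vector<float>& w,
                            std::vector<int16_t>* out) {
  out->resize(w.size());
  float max_val = FindMaxAbs(w.data(), static_cast<int>(w.size()));
  ConvertFP32ToInt16(
      w.data(), out->data(), max_val, static_cast<int>(w.size()));
  // e.g. w = {0.5f, -1.0f, 0.25f} gives max_val = 1.0 and
  // out = {16384, -32767, 8192}; 0.5 * 32767 = 16383.5 is a tie and
  // rounds to the even value 16384.
}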
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
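`Transpose` writes `in`, an h×w row-major matrix, into `out` as w×h row-major. A 2×3 spot check:

#include <cassert>

inline void TransposeExample() {
  // in is 2x3 row-major:  1 2 3
  //                       4 5 6
  const int in[6] = {1, 2, 3, 4, 5, 6};
  int out[6] = {0};
  Transpose(in, out, /*h=*/2, /*w=*/3);
  // out is 3x2 row-major: 1 4 / 2 5 / 3 6
  const int expect[6] = {1, 4, 2, 5, 3, 6};
  for (int i = 0; i < 6; ++i) assert(out[i] == expect[i]);
}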
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
* Matrix Descriptor of a memory buffer.
*
* It is used for Blas::MatMul. The MatMul operator can be batched:
* if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], the call
* performs `batch_size` GEMMs. A batched GEMM can be faster, depending on the
* implementation of the BLAS library. The batch size may be zero; if either
* operand of `matmul` has a batch size, the call is a batched GEMM, too.
* e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [H2 = W1, W2], the
* result matrix will be [BatchSize, H1, W2].
*
* The boolean flag `trans_` describes whether the memory holds the transpose
* of the matrix. If trans is true, the last two dims of the matrix are
* transposed, i.e. the memory layout is [Width, Height] or
* [BatchSize, Width, Height].
*
* The MatDescriptor is not only the dimension or shape of a matrix; it also
* carries the layout and stride. It is clearer to have a dedicated structure
* than to reuse `DDim`.
*/
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
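To make the descriptor semantics concrete, here is what `CreateMatrixDescriptor` yields for a rank-3 operand. A sketch assuming `lite::DDim` is built from an `int64_t` vector, as elsewhere in this patch:

#include <cassert>
#include <vector>

inline void DescribeExample() {
  namespace m = paddle::lite::xpu::math;
  paddle::lite::DDim dim(std::vector<int64_t>({4, 6, 8}));
  m::MatDescriptor d = m::CreateMatrixDescriptor(dim, 0, /*trans=*/false);
  assert(d.batch_size_ == 4);  // leading dims fold into the batch
  assert(d.height_ == 6 && d.width_ == 8);
  assert(d.stride_ == 6 * 8);  // elements per batch item
  // With trans = true the last two dims swap: height_ = 8, width_ = 6.
}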
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
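Taken together, the wrapper gives host code a minimal allocate/copy/free path to XPU device memory. A usage sketch with error handling elided, as in the implementation above:

#include <vector>

// Round-trip a small buffer through XPU device memory.
inline void XpuRoundTrip() {
  std::vector<float> host_src(64, 1.0f), host_dst(64, 0.0f);
  size_t bytes = host_src.size() * sizeof(float);
  void* dev = paddle::lite::TargetWrapperXPU::Malloc(bytes);
  paddle::lite::TargetWrapperXPU::MemcpySync(
      dev, host_src.data(), bytes, paddle::lite::IoDirection::HtoD);
  paddle::lite::TargetWrapperXPU::MemcpySync(
      host_dst.data(), dev, bytes, paddle::lite::IoDirection::DtoH);
  paddle::lite::TargetWrapperXPU::Free(dev);
}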
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc ...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86 X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm) BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -15,5 +15,11 @@ ...@@ -15,5 +15,11 @@
#include "lite/core/context.h" #include "lite/core/context.h"
namespace paddle { namespace paddle {
namespace lite {} // namespace lite namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -24,6 +24,14 @@ ...@@ -24,6 +24,14 @@
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#endif #endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
#include <map> #include <map>
#include <memory> #include <memory>
...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> { ...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> {
public: public:
Context() {} Context() {}
explicit Context(const XPUContext& ctx); explicit Context(const XPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {} void CopySharedTo(XPUContext* ctx) {}
static xdnn::Context* GetRawContext() {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
}
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
}
xpu_set_device(dev_no);
}
std::string name() const { return "XPUContext"; } std::string name() const { return "XPUContext"; }
private:
static thread_local xdnn::Context* _tls_raw_ctx;
}; };
#endif #endif
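The raw xdnn context is created lazily, once per thread. A sketch of how a kernel might bind a device and fetch it (the `PrepareXpu` helper is illustrative; the actual xdnn math calls are elided):

// Bind a device (LITE_XPU_DEV overrides dev_no when set), fetch the
// thread-local context, and size its L3 workspace.
inline void PrepareXpu() {
  paddle::lite::XPUContext::SetDev(/*dev_no=*/0);
  paddle::lite::xdnn::Context* ctx =
      paddle::lite::XPUContext::GetRawContext();
  paddle::lite::XPUContext::SetWorkspaceL3Size(0xfffc00);
  (void)ctx;  // hand ctx to xdnn math calls
}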
...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> { ...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> {
}; };
#endif #endif
#ifdef LITE_WITH_MLU
template <>
class Context<TargetType::kMLU> {
public:
typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
u32_t affinity() { return affinity_; }
cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
int device_id() { return device_id_; }
std::string name() const { return "MLUContext"; }
private:
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
std::vector<cnrtNotifier_t> input_notifiers_;
std::vector<cnrtNotifier_t> output_notifiers_;
cnrtInvokeFuncParam_t forward_param_;
u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
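A sketch of how caller code would wire this context to device 0 and pull out its queues (the `BindMlu` helper is illustrative):

inline void BindMlu(paddle::lite::MLUContext* ctx) {
  ctx->Init(/*dev_id=*/0, /*exec_queue_id=*/0, /*io_queue_id=*/0);
  cnrtQueue_t compute = ctx->exec_queue();  // enqueue kernel launches here
  cnrtQueue_t io = ctx->io_queue();         // enqueue H2D/D2H copies here
  (void)compute;
  (void)io;
}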
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
// Only works with CUDA kernels. // Only works with CUDA kernels.
template <> template <>
...@@ -398,6 +512,16 @@ class ContextScheduler { ...@@ -398,6 +512,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo( kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>()); &ctx->As<BMContext>());
break; break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif #endif
default: default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
...@@ -439,6 +563,9 @@ class ContextScheduler { ...@@ -439,6 +563,9 @@ class ContextScheduler {
#endif #endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>(); InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif #endif
} }
......
...@@ -58,7 +58,7 @@
namespace paddle {
namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;

+#ifdef LITE_WITH_MLU
+thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
+thread_local int DeviceInfo::mlu_core_number_{1};
+thread_local bool DeviceInfo::use_first_conv_{false};
+thread_local std::vector<float> DeviceInfo::mean_vec_;
+thread_local std::vector<float> DeviceInfo::std_vec_;
+thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
+#endif
+
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
  return 0;
}

+#ifdef LITE_WITH_MLU
+void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
+                               int core_number,
+                               bool use_first_conv,
+                               const std::vector<float>& mean_vec,
+                               const std::vector<float>& std_vec,
+                               DataLayoutType input_layout) {
+  switch (core_version) {
+    case (lite_api::MLUCoreVersion::MLU_220):
+      mlu_core_version_ = CNML_MLU220;
+      break;
+    case (lite_api::MLUCoreVersion::MLU_270):
+      mlu_core_version_ = CNML_MLU270;
+      break;
+    default:
+      mlu_core_version_ = CNML_MLU270;
+      break;
+  }
+  mlu_core_number_ = core_number;
+  use_first_conv_ = use_first_conv;
+  mean_vec_ = mean_vec;
+  std_vec_ = std_vec;
+  input_layout_ = input_layout;
+}
+
+cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
+
+int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
+
+bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
+
+const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
+
+const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
+
+DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
+#endif  // LITE_WITH_MLU
+
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif  // LITE_WITH_ARM

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id) {
+  LOG(INFO) << "Set mlu device " << device_id;
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+}
+
+void Device<TARGET(kMLU)>::Init() {
+  SetMluDevice(idx_);
+  GetInfo();
+  CreateQueue();
+}
+
+void Device<TARGET(kMLU)>::GetInfo() {}
+
+void Device<TARGET(kMLU)>::CreateQueue() {
+  exec_queue_.clear();
+  io_queue_.clear();
+  for (size_t i = 0; i < max_queue_; ++i) {
+    cnrtQueue_t exec_queue;
+    cnrtQueue_t io_queue;
+    cnrtCreateQueue(&exec_queue);
+    cnrtCreateQueue(&io_queue);
+    exec_queue_.push_back(exec_queue);
+    io_queue_.push_back(io_queue);
+
+    cnrtCreateQueue(&exec_queue);
+    exec_queue_.push_back(exec_queue);
+  }
+}
+#endif  // LITE_WITH_MLU
+
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
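A minimal configuration sketch for the new thread-local MLU state above; the argument values are illustrative only, and ConfigureMLUForThisThread is a hypothetical helper:

#include "lite/core/device_info.h"

void ConfigureMLUForThisThread() {
  // Every field set here is thread_local in DeviceInfo, so each worker
  // thread configures its own copy.
  paddle::lite::DeviceInfo::Global().SetMLURunMode(
      paddle::lite_api::MLUCoreVersion::MLU_270,  // unknown versions fall back to CNML_MLU270
      /*core_number=*/4,
      /*use_first_conv=*/false,
      /*mean_vec=*/{},
      /*std_vec=*/{},
      /*input_layout=*/DATALAYOUT(kNHWC));
}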
...@@ -19,11 +19,14 @@
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/mlu_utils.h"
+#endif

namespace paddle {
namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))

typedef enum {
  kAPPLE = 0,
...@@ -52,6 +55,20 @@ class DeviceInfo {
  int Setup();

  void SetRunMode(lite_api::PowerMode mode, int thread_num);
+#ifdef LITE_WITH_MLU
+  void SetMLURunMode(lite_api::MLUCoreVersion core_version,
+                     int core_number,
+                     bool use_first_conv,
+                     const std::vector<float>& mean_vec,
+                     const std::vector<float>& std_vec,
+                     DataLayoutType input_layout);
+  cnmlCoreVersion_t MLUCoreVersion();
+  int MLUCoreNumber();
+  bool UseFirstConv();
+  const std::vector<float>& MeanVec() const;
+  const std::vector<float>& StdVec() const;
+  DataLayoutType InputLayout() const;
+#endif
  void SetCache(int l1size, int l2size, int l3size);
  void SetArch(ARMArch arch) { arch_ = arch; }
...@@ -103,6 +120,15 @@ class DeviceInfo {
  static thread_local TensorLite workspace_;
  static thread_local int64_t count_;

+#ifdef LITE_WITH_MLU
+  static thread_local cnmlCoreVersion_t mlu_core_version_;
+  static thread_local int mlu_core_number_;
+  static thread_local bool use_first_conv_;
+  static thread_local std::vector<float> mean_vec_;
+  static thread_local std::vector<float> std_vec_;
+  static thread_local DataLayoutType input_layout_;
+#endif
+
  void SetDotInfo(int argc, ...);
  void SetFP16Info(int argc, ...);
  void SetFP32Info(int argc, ...);
...@@ -134,6 +160,9 @@ class Env {
    return *devs;
  }
  static void Init(int max_stream = 4) {
+#ifdef LITE_WITH_MLU
+    CNRT_CALL(cnrtInit(0));
+#endif
    Devs& devs = Global();
    if (devs.size() > 0) {
      return;
...@@ -156,6 +185,41 @@ class Env {
  }
};

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id);
+
+template <>
+class Device<TARGET(kMLU)> {
+ public:
+  Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
+  void Init();
+
+  int id() { return idx_; }
+  int max_queue() { return max_queue_; }
+  void SetId(int idx) { idx_ = idx; }
+  std::string name() { return "MLU"; }
+  int core_num() { return 16; }
+  float max_memory() { return 16 * 1024; }
+  std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
+  std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
+
+ private:
+  void CreateQueue();
+  void GetInfo();
+
+ private:
+  int idx_{0};
+  int max_queue_;
+  std::string device_name_;
+  float max_memory_;
+
+  std::vector<cnrtQueue_t> io_queue_;
+  std::vector<cnrtQueue_t> exec_queue_;
+};
+
+template class Env<TARGET(kMLU)>;
+#endif  // LITE_WITH_MLU
+
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
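A small sketch of how the new Device<TARGET(kMLU)> specialization is reached through Env, assuming LITE_WITH_MLU; ListMLUDevices is a hypothetical helper, and the device probing follows the generic Env::Init() path:

#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"

void ListMLUDevices() {
  // Per the change above, Init() calls cnrtInit(0) before the generic
  // device setup, which constructs Device objects and creates their queues.
  paddle::lite::Env<TARGET(kMLU)>::Init();
  for (auto& dev : paddle::lite::Env<TARGET(kMLU)>::Global()) {
    LOG(INFO) << dev.name() << " #" << dev.id()
              << ": exec queues " << dev.exec_queues().size()
              << ", io queues " << dev.io_queues().size();
  }
}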
...@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
    WorkSpace::Global_CUDA().AllocReset();
#endif
+#if defined(LITE_WITH_MLU)
+    WorkSpace::Global_MLU().AllocReset();
+#endif
#ifdef LITE_WITH_PROFILE
    profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
    profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
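The reset added above pairs with a per-target scratch workspace. A sketch of the same call from user code, assuming WorkSpace lives in lite/core/workspace.h as it does for the other targets; ResetMLUScratch is a hypothetical helper:

#include "lite/core/workspace.h"

void ResetMLUScratch() {
#if defined(LITE_WITH_MLU)
  // Same call the kernel-launch path above performs before dispatch.
  paddle::lite::WorkSpace::Global_MLU().AllocReset();
#endif
}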
...@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) {
      data = TargetWrapper<TARGET(kBM)>::Malloc(size);
      break;
#endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
+      break;
+#endif  // LITE_WITH_MLU
+#ifdef LITE_WITH_XPU
+    case TargetType::kXPU:
+      data = TargetWrapperXPU::Malloc(size);
+      break;
+#endif  // LITE_WITH_XPU
    default:
      LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
  }
...@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
      TargetWrapper<TARGET(kBM)>::Free(data);
      break;
#endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      TargetWrapper<TARGET(kMLU)>::Free(data);
+      break;
+#endif  // LITE_WITH_MLU
+#ifdef LITE_WITH_XPU
+    case TargetType::kXPU:
+      TargetWrapperXPU::Free(data);
+      break;
+#endif  // LITE_WITH_XPU
    default:
      LOG(FATAL) << "Unknown type";
  }
...@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
      TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
      break;
#endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      TargetWrapper<TARGET(kMLU)>::MemcpySync(
+          dst, src, size, IoDirection::HtoD);
+      break;
+#endif
#ifdef LITE_WITH_OPENCL
    case TargetType::kOpenCL:
      TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
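A round-trip sketch through the dispatchers extended above; MluRoundTrip is a hypothetical helper. Note that, unlike the DtoD copies in the other device branches, the kMLU case of TargetCopy issues an HtoD copy:

#include "lite/core/memory.h"

void MluRoundTrip(const void* host_src, size_t bytes) {
  // Dispatches to TargetWrapper<TARGET(kMLU)>::Malloc/MemcpySync/Free.
  void* dev_buf = paddle::lite::TargetMalloc(paddle::lite::TargetType::kMLU, bytes);
  paddle::lite::TargetCopy(paddle::lite::TargetType::kMLU, dev_buf, host_src, bytes);
  paddle::lite::TargetFree(paddle::lite::TargetType::kMLU, dev_buf);
}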
...@@ -31,6 +31,14 @@
#include "lite/backends/bm/target_wrapper.h"
#endif  // LITE_WITH_BM

+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/target_wrapper.h"
+#endif  // LITE_WITH_MLU
+
+#ifdef LITE_WITH_XPU
+#include "lite/backends/xpu/target_wrapper.h"
+#endif  // LITE_WITH_XPU
+
namespace paddle {
namespace lite {
...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
      TargetWrapperCL::MemcpySync(dst, src, size, dir);
      break;
#endif  // LITE_WITH_OPENCL
+#ifdef LITE_WITH_MLU
+    case TARGET(kMLU):
+      TargetWrapperMlu::MemcpySync(dst, src, size, dir);
+      break;
+#endif
#ifdef LITE_WITH_FPGA
    case TARGET(kFPGA):
      TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
...@@ -126,7 +139,7 @@ class Buffer {
                   const size_t img_h,
                   void* host_ptr = nullptr) {
    if (target != target_ || cl_image2d_width_ < img_w ||
-        cl_image2d_height_ < img_h) {
+        cl_image2d_height_ < img_h || host_ptr != nullptr) {
      CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
      Free();
      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
......
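The templated CopySync dispatcher gains the matching MLU branch; a usage sketch in which PushToMLU is a hypothetical helper and dev_dst is assumed to come from TargetWrapperMlu::Malloc:

#include "lite/core/memory.h"

void PushToMLU(void* dev_dst, const float* host_src, size_t n) {
  // Routes to TargetWrapperMlu::MemcpySync per the new case above.
  paddle::lite::CopySync<TARGET(kMLU)>(
      dev_dst, host_src, n * sizeof(float), paddle::lite::IoDirection::HtoD);
}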
...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
    fusion/elementwise_add_activation_fuse_pass.cc
    fusion/quant_dequant_fuse_pass.cc
    fusion/sequence_pool_concat_fuse_pass.cc
+    fusion/__xpu__resnet_fuse_pass.cc
+    fusion/__xpu__multi_encoder_fuse_pass.cc
    elimination/identity_scale_eliminate_pass.cc
    elimination/elementwise_mul_constant_eliminate_pass.cc
    elimination/assign_value_eliminate_pass.cc
...@@ -36,6 +38,7 @@ lite_cc_library(mir_passes
    demo_pass.cc
    runtime_context_assign_pass.cc
    memory_optimize_pass.cc
+    mlu_postprocess_pass.cc
    weight_quantization_preprocess_pass.cc
    quantized_op_attributes_inference_pass.cc
    DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
...@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
if (WITH_TESTING)
    list(APPEND pattern_deps gtest)
endif()
-lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps})
+lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)

-lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher)
+lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)

# for mobile, unnecessary to compile the following testings.
......
...@@ -27,8 +27,8 @@
#include "lite/utils/string.h"

namespace paddle {
-namespace inference {
-namespace analysis {
+namespace lite {
+namespace mir {

static size_t dot_node_counter{0};
...@@ -162,6 +162,6 @@ class Dot {
  std::vector<Attr> attrs_;
};

-}  // namespace analysis
-}  // namespace inference
+}  // namespace mir
+}  // namespace lite
}  // namespace paddle
...@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose
    DEPS pattern_matcher_high_api)
lite_cc_library(fuse_interpolate
    SRCS interpolate_fuser.cc
    DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_pool_concat
    SRCS sequence_pool_concat_fuser.cc
    DEPS pattern_matcher_high_api)

set(mir_fusers
    fuse_fc
......
[2 collapsed file diffs omitted]
...@@ -26,15 +26,13 @@ namespace paddle {
namespace lite {
namespace mir {

-using inference::analysis::Dot;
-
void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  VLOG(5) << "\n" << Visualize(graph.get());
}

std::string Visualize(mir::SSAGraph* graph) {
  std::ostringstream os;
-  inference::analysis::Dot dot;
+  Dot dot;
  auto string_trunc = [](const std::string& str) -> std::string {
    const int max_disp_size = 100;
    if (str.length() > max_disp_size)
......
...@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h"
#include <list>
#include <memory>
-#include <set>
#include <string>
#include <utility>
#include <vector>
...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
    op_desc.SetAttr<int>("out_dtype", 4);  // FP16
    op_desc.SetInput("X", {cur_node->AsArg().name});
    op_desc.SetOutput("Out", {cast_arg_name});
-  } else if (op_type == "transpose") {
+  } else if (op_type == "layout") {
    // NCHW -> NHWC
-    op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1});
-    op_desc.SetInput("X", {cur_node->AsArg().name});
+    op_desc.SetInput("Input", {cur_node->AsArg().name});
    op_desc.SetOutput("Out", {cast_arg_name});
  } else if (op_type == "io_copy") {
    op_desc.SetInput("Input", {cur_node->AsArg().name});
...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
      if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
        is_found = true;
      }
-    } else if (op_type == "transpose") {
-      is_found = true;
+    } else if (op_type == "layout") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+      if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
+          DataLayoutCompatible(*out_arg_ty, *cast_type) &&
+          // for first conv
+          PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
+        is_found = true;
+      }
    } else if (op_type == "io_copy") {
      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
      // we pick the kernel
      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
      auto& stmt = cast_inst->AsStmt();
-      stmt.picked_kernel().SetContext(
-          ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
+      if (op_type == "layout") {
+        stmt.picked_kernel().SetContext(
+            ContextScheduler::Global().NewContext(TARGET(kX86)));
+      } else {
+        stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
+            stmt.picked_kernel().target()));
+      }
      break;
    }
  }
...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
  cast_arg->AsArg().type = cast_type;
  auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
  // for CastAfter manually set the tensor's type
-  var->GetMutable<::paddle::lite::Tensor>();
+  var->GetMutable<paddle::lite::Tensor>();

  // create the stmt node
  auto* cast_inst = graph->NewInstructNode();
...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
    op_desc.SetAttr<int>("out_dtype", 5);  // FP16
    op_desc.SetInput("X", {cast_arg_name});
    op_desc.SetOutput("Out", {cur_node->AsArg().name});
-  } else if (op_type == "transpose") {
+  } else if (op_type == "layout") {
    // NHWC -> NCHW
-    op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
-    op_desc.SetInput("X", {cast_arg_name});
+    op_desc.SetInput("Input", {cast_arg_name});
    op_desc.SetOutput("Out", {cur_node->AsArg().name});
  } else if (op_type == "io_copy") {
    op_desc.SetInput("Input", {cast_arg_name});
...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
      if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
        is_found = true;
      }
-    } else if (op_type == "transpose") {
-      is_found = true;
+    } else if (op_type == "layout") {
+      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
+      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
+      if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
+          DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
+        is_found = true;
+      }
    } else if (op_type == "io_copy") {
      const Type* in_arg_ty = kernel->GetInputDeclType("Input");
      const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
      // we pick the kernel
      cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
      auto& stmt = cast_inst->AsStmt();
-      stmt.picked_kernel().SetContext(
-          ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
+      if (op_type == "layout") {
+        stmt.picked_kernel().SetContext(
+            ContextScheduler::Global().NewContext(TARGET(kX86)));
+      } else {
+        stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
+            stmt.picked_kernel().target()));
+      }
      break;
    }
  }
...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
  auto* cur_node = head_node;
  const auto name_prefix =
      head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
+  bool is_first_conv_head =
+      std::find(first_conv_nodes_.begin(),
+                first_conv_nodes_.end(),
+                head_node->AsArg().name) != first_conv_nodes_.end();

-  // layout cast node
-  if (head_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
    cur_node = InsertCastBefore(
-        "transpose",
-        name_prefix + "transpose",
+        "cast",
+        name_prefix + "cast",
        graph,
        cur_node,
        inst_node,
        LiteType::GetTensorTy(
-            head_type->target(), head_type->precision(), inst_type->layout()));
+            head_type->target(), inst_type->precision(), head_type->layout()));
  }

-  // precision cast node
-  if (head_type->precision() != inst_type->precision()) {
+  // layout cast node
+  if (head_type->layout() != inst_type->layout()) {
    cur_node = InsertCastBefore(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
        graph,
        cur_node,
        inst_node,
...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
  // get subgraph's valid precision
  const auto& places = graph->valid_places();
-  std::set<::paddle::lite_api::PrecisionType> prec_set;
+  std::set<paddle::lite_api::PrecisionType> prec_set;
  for (const auto& place : places) {
    if (place.target == TARGET(kMLU)) {
      prec_set.insert(place.precision);
...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
  const auto name_prefix =
      tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";

-  // layout cast node
-  if (tail_type->layout() != inst_type->layout()) {
+  // precision cast node
+  if (tail_type->precision() != inst_type->precision()) {
    cur_node = InsertCastAfter(
-        "transpose",
-        name_prefix + "transpose",
+        "cast",
+        name_prefix + "cast",
        graph,
        cur_node,
        inst_node,
        LiteType::GetTensorTy(
-            tail_type->target(), tail_type->precision(), inst_type->layout()));
+            tail_type->target(), inst_type->precision(), tail_type->layout()));
  }

-  // precision cast node
-  if (tail_type->precision() != inst_type->precision()) {
+  // layout cast node
+  if (tail_type->layout() != inst_type->layout()) {
    cur_node = InsertCastAfter(
-        "cast",
-        name_prefix + "cast",
+        "layout",
+        name_prefix + "layout",
        graph,
        cur_node,
        inst_node,
...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
    auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
    UpdateOutputTo(
        sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
+    /* graph like this
+     *        subgraph_op_0
+     *          /       \
+     *         /         \
+     * subgraph_op_1   host_op
+     */
+    UpdateInputTo(
+        sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
  }

  // recreate the op
...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
  }
}

+bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
+  auto* block_desc =
+      static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
+          ->GetSubBlock();
+  for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
+    auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
+    CHECK(op_desc);
+    if (op_desc->Type() == "conv2d") {
+      for (auto& names : op_desc->inputs()) {
+        if (std::find(names.second.begin(),
+                      names.second.end(),
+                      arg_node->AsArg().name) != names.second.end()) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
+  CHECK(arg_node->IsArg());
+  for (auto& inst : arg_node->outlinks) {
+    if (inst->AsStmt().op_type() == "subgraph") {
+      return IsFirstConvInSubgraph(arg_node, inst);
+    }
+  }
+  return false;
+}
+
+void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
+  for (auto& node : graph->mutable_nodes()) {
+    if (!node.IsStmt()) continue;
+    if (node.AsStmt().op_type() == "feed") {
+      for (auto& out : node.outlinks) {
+        if (IsFirstConvNode(out)) {
+          first_conv_nodes_.insert(out->AsArg().name);
+          // modify first conv nodes' type
+          const auto* old_type = out->AsArg().type;
+          out->AsArg().type =
+              LiteType::GetTensorTy(old_type->target(),
+                                    paddle::lite_api::PrecisionType::kInt8,
+                                    old_type->layout(),
+                                    old_type->device());
+        }
+      }
+    }
+  }
+}
+
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
  for (auto& node : graph->mutable_nodes()) {
    if (!node.IsStmt()) continue;
...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
        out->AsArg().type =
            LiteType::GetTensorTy(old_type->target(),
                                  old_type->precision(),
-                                  ::paddle::lite_api::DataLayoutType::kNHWC,
+                                  paddle::lite_api::DataLayoutType::kNHWC,
                                  old_type->device());
      }
    }
...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
        inp->AsArg().type =
            LiteType::GetTensorTy(old_type->target(),
                                  old_type->precision(),
-                                  ::paddle::lite_api::DataLayoutType::kNHWC,
+                                  paddle::lite_api::DataLayoutType::kNHWC,
                                  old_type->device());
      }
    }
...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
}

void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // Currently, for non-persistent input and output args, the MLU subgraph op
  // only supports float16/float32 data types,
  // in two situations as follows:
  // 1: feed->arg_in->subgraph->...  2: ...->subgraph->arg_out->fetch;
  // arg_in and arg_out are assumed to be NHWC, which the user should be
  // aware of. Thus here we change these args' layout to NHWC.
-  ModifyLayout(graph.get());
+#ifdef LITE_WITH_MLU
+  if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
+    ModifyLayout(graph.get());
+  }
+
+  if (lite::DeviceInfo::Global().UseFirstConv()) {
+    GatherAndModifyFirstConvNodes(graph.get());
+  }
+#endif

  // insert io_copy, layout and precision cast of subgraph's inputs and outputs
  for (auto& node : graph->mutable_nodes()) {
......
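Net effect of the pass, sketched for an NCHW/float32 feed in front of an fp16/NHWC MLU subgraph: InsertBefore now emits a precision cast, then a layout cast, then an io_copy, i.e. feed -> arg -> cast -> layout -> io_copy -> subgraph. The two gates that Apply() consults read the thread-local DeviceInfo state added earlier in this change; the helpers below are hypothetical names for those reads, not part of the patch:

#include "lite/core/device_info.h"

bool MluWantsNHWCInputs() {
  // Gate for ModifyLayout().
  return paddle::lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC);
}

bool MluWantsFirstConvQuant() {
  // Gate for GatherAndModifyFirstConvNodes().
  return paddle::lite::DeviceInfo::Global().UseFirstConv();
}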
...@@ -15,6 +15,7 @@
#pragma once
#include <memory>
+#include <set>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
                        const Type* cast_type);

  void RecreateOp(Node* inst_node, SSAGraph* graph);

+  void GatherAndModifyFirstConvNodes(SSAGraph* graph);
+  bool IsFirstConvNode(Node* arg_node);
+  bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
+
+ private:
+  std::set<std::string> first_conv_nodes_;
};

}  // namespace mir
......
...@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) {
}

std::string PMPattern::DotString() const {
-  using inference::analysis::Dot;
  Dot dot;
  int id = 0;
  // Create Nodes
......
...@@ -64,7 +64,6 @@ class FuseBase {
 protected:
  virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;

- private:
  void PerformPatternMatcher(SSAGraph* graph);

  // Delete nodes that are marked as Intermediate
......
...@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
  return adj_list;
}

+std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
+  std::map<mir::Node *, std::set<mir::Node *>> adj_list;
+
+  for (auto &n : mutable_nodes()) {
+    if (adj_list.find(&n) == adj_list.end()) {
+      adj_list[&n] = std::set<mir::Node *>();
+    }
+    std::vector<mir::Node *> nodes;
+    for (auto &var : n.inlinks) {
+      nodes.push_back(var);
+    }
+    std::sort(nodes.begin(),
+              nodes.end(),
+              [](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
+    adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
+                        std::make_move_iterator(nodes.end()));
+  }
+  return adj_list;
+}
+
void SSAGraph::SortHelper(
    const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
    mir::Node *node,
...@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
  return res;
}

+std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
+  CheckBidirectionalConnection();
+
+  std::stack<mir::Node *> stack;
+  std::set<mir::Node *> visited;
+  std::vector<mir::Node *> res;
+
+  auto adj_list = BuildNodeAdjList();
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &res);
+    }
+  }
+
+  return res;
+}
+
Node *SSAGraph::GraphCreateInstructNode(
    const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
  node_storage_.emplace_back();
...@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
}

mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
-  auto it = arguments_.find(arg);
-  if (it != arguments_.end()) {
-    return it->second;
+  for (auto &node : node_storage_) {
+    if (node.IsArg() && node.arg()->name == arg) {
+      return &node;
+    }
  }
  return nullptr;
}
......
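NodeTopologicalOrder() reuses the same DFS scheme as StmtTopologicalOrder(), just over every node (ops and vars). A self-contained sketch of that scheme on plain ints, with the adjacency map holding each node's inputs:

#include <map>
#include <set>
#include <vector>

// Visit all inputs of `node` first, then emit `node`: producers always
// precede consumers in `res`.
void SortHelperSketch(const std::map<int, std::set<int>>& adj, int node,
                      std::set<int>* visited, std::vector<int>* res) {
  visited->insert(node);
  for (int in : adj.at(node)) {
    if (!visited->count(in)) SortHelperSketch(adj, in, visited, res);
  }
  res->push_back(node);
}

std::vector<int> TopologicalOrderSketch(const std::map<int, std::set<int>>& adj) {
  std::set<int> visited;
  std::vector<int> res;
  for (const auto& kv : adj) {
    if (!visited.count(kv.first)) SortHelperSketch(adj, kv.first, &visited, &res);
  }
  return res;
}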
...@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
  std::vector<mir::Node *> StmtTopologicalOrder();

+  std::vector<mir::Node *> NodeTopologicalOrder();
+
  // The inputs of the graph.
  std::vector<mir::Node *> inputs();
...@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
  // Build operator inlink edge table.
  std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();

+  // Build node inlink edge table.
+  std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
+
  void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
                  mir::Node *node,
                  std::set<mir::Node *> *visited,
......
...@@ -30,10 +30,8 @@ namespace paddle {
namespace lite {
namespace mir {

-using inference::analysis::Dot;
-
std::string SubgraphVisualizer::operator()() {
-  inference::analysis::Dot dot;
+  Dot dot;
  const std::vector<std::string> subgraph_colors{
      "red",   "green",         "cyan",       "bisque3",
      "coral", "darkseagreen1", "goldenrod1", "darkorchid",
...@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
    node_map_t *nodes) {
-  for (auto &it : *nodes) {
-    node_dat_t *node = it.second;
+  for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
+    // Different traversal orders over the graph's nodes may lead to different
+    // subgraph divisions, which may produce different results on devices such
+    // as the MLU. These different results are all "right", but a little
+    // confusing. Thus the topological order is used instead of the address of
+    // the node in the graph.
+    CHECK(nodes->find(ordered_node) != nodes->end());
+    node_dat_t *node = (*nodes)[ordered_node];
    if (!node->marked) {
      continue;
    }
...@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
      unused_var_nodes->insert(var_node);
      continue;
    }
-    // Var can have more than one next op node, So, if any one in the
-    // op_nodes then continue
-    bool next_op_in_nodes = false;
+    // A var can have more than one next op node, so only if all of its next
+    // ops are in op_nodes should it be put into local_var_nodes.
+    bool next_op_in_nodes = true;
    for (auto &next_op_node : var_node->outlinks) {
-      if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
+      if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) ==
          op_nodes.end()) {
-        next_op_in_nodes = true;
+        next_op_in_nodes = false;
+        break;
      }
    }
    if (next_op_in_nodes) {
......
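The corrected membership test above ("local only if every consumer is inside the subgraph") is the classic all-of check; the generic helper below restates it and is not part of the patch:

#include <algorithm>
#include <vector>

template <class Node>
bool AllConsumersInside(const std::vector<Node*>& op_nodes,
                        const std::vector<Node*>& consumers) {
  // Equivalent to the rewritten loop: assume true, flip to false on the
  // first consumer not found in op_nodes.
  return std::all_of(consumers.begin(), consumers.end(), [&](Node* n) {
    return std::find(op_nodes.begin(), op_nodes.end(), n) != op_nodes.end();
  });
}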
...@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU
      Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
-#ifdef LITE_WITH_XPU
+#ifdef LITE_WITH_XTCL
      Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
  });
......
...@@ -20,6 +20,7 @@
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h"
+#include "lite/utils/env.h"

namespace paddle {
namespace lite {
...@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
}

void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
  std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
...@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  fuser();
}

+void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
}  // namespace mir
}  // namespace lite
}  // namespace paddle
...@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
    .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
    .BindTargets({TARGET(kBM)});
+REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
+    .BindTargets({TARGET(kMLU)});
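The USE_SUBGRAPH_BRIDGE trick used by MLUSubgraphPass::Apply above expands each registered bridge into one insert. A sketch of the expansion, where the conv2d line stands in for the real include of lite/kernels/mlu/bridges/paddle_use_bridges.h:

#include <string>
#include <unordered_set>

std::unordered_set<std::string> BuildMLUSupportedOps() {
  std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
  USE_SUBGRAPH_BRIDGE(conv2d, kMLU)  // expands to supported_lists.insert("conv2d");
#undef USE_SUBGRAPH_BRIDGE
  return supported_lists;
}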
...@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};

+class MLUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
}  // namespace mir
}  // namespace lite
} // namespace paddle } // namespace paddle
...@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU
  valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif
-#ifdef LITE_WITH_XPU
+#ifdef LITE_WITH_XTCL
  valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
  auto tar_predictor = TestModel(FLAGS_model_dir,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
...@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
      VLOG(4) << "picked, opencl found";
      is_found = true;
    } else if (TypeCompatible(*in_arg_ty, from) &&
-               out_arg_ty->target() == to.target()) {
+               TargetCompatibleTo(*out_arg_ty, to)) {
      VLOG(4) << "picked";
      is_found = true;
    }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
#include "lite/core/op_lite.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUPatternMatcher::operator()(SSAGraph *graph,
XPUPatternMatcher::handle_t handler) {
if (!MarkPMNodesInGraph(graph)) {
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
LOG(INFO) << "detected " << subgraphs.size() << " subgraph";
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
VLOG(3) << "mark pmnodes in graph";
if (graph->nodes().empty()) return false;
for (auto &node : graph->mutable_nodes()) {
for (const auto &pmnode : pattern_.nodes()) {
if (pmnode->Tell(&node)) {
pmnodes2nodes_[pmnode.get()].insert(&node);
}
}
}
// Check for early stop if some PMNode can't find a matched Node.
for (auto &pmnode : pattern_.nodes()) {
if (!pmnodes2nodes_.count(pmnode.get())) {
VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
// return false;
}
}
VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
return !pmnodes2nodes_.empty();
}
// The intermediate Nodes can only link to the nodes inside the pattern, or this
// subgraph will be dropped.
void XPUPatternMatcher::ValidateByNodeRole(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
subgraphs->erase(
std::remove_if(subgraphs->begin(),
subgraphs->end(),
[](const XPUPatternMatcher::subgraph_t &subgraph) -> bool {
// Collect the inlinks and outlinks.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->outlinks) {
if (!ios.count(x)) {
return true;
}
}
}
}
return false;
}),
subgraphs->end());
for (auto &subgraph : *subgraphs) {
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
extra_input_vars_.emplace_back();
for (auto &item : subgraph) {
for (auto *x : item.second->inlinks) {
if (x->IsArg() && ios.count(x) == 0) {
// extra weight var
extra_input_vars_.back().push_back(x);
}
}
}
}
}
struct HitGroup {
std::unordered_map<PMNode *, Node *> roles;
bool Match(Node *node, PMNode *pat) {
if (nodes_.count(node)) {
if (roles.count(pat) && roles[pat] == node) return true;
return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
}
}
void Register(Node *node, PMNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outlinks) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<PatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
// Init empty subgraphs.
std::vector<PatternMatcher::subgraph_t> result;
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pmnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pmnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
bi_records[0] = std::move(init_groups);
// Extend a PMNode to subgraphs by deducing the connection relations defined
// in edges of PMNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PMNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pmnodes2nodes_[edge.first]) {
for (Node *target : pmnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
HitGroup new_group = group;
bool flag = new_group.Match(source, edge.first) &&
new_group.Match(target, edge.second);
if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
}
for (auto &group : bi_records[step % 2]) {
XPUPatternMatcher::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
struct GraphItemLessThan {
bool operator()(const std::pair<PMNode *, Node *> &a,
const std::pair<PMNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
};
// TODO(Superjomn) enhance the function, as it may mark unique subgraphs as duplicates;
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void XPUPatternMatcher::UniquePatterns(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<PatternMatcher::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
STL::stringstream ss;
for (auto &item : sorted_keys) {
ss << reinterpret_cast<size_t>(item.first) << ":"
<< reinterpret_cast<size_t>(item.second);
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void XPUPatternMatcher::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
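DetectPatterns() above extends matches edge by edge using two vectors reused in alternation. A generic skeleton of that rolling-buffer idiom (all names here are illustrative, not part of the patch):

#include <array>
#include <utility>
#include <vector>

template <class Group, class ExtendFn>
std::vector<Group> RollMatches(std::vector<Group> init, int num_edges,
                               ExtendFn extend) {
  std::array<std::vector<Group>, 2> bi_records;
  int step = 0;
  bi_records[0] = std::move(init);
  for (int e = 0; e < num_edges; ++e) {
    auto& pre_groups = bi_records[step % 2];          // matches so far
    auto& cur_groups = bi_records[1 - (step++ % 2)];  // matches after edge e
    cur_groups.clear();
    for (const auto& g : pre_groups) {
      extend(g, e, &cur_groups);  // push zero or more extended copies of g
    }
  }
  return bi_records[step % 2];  // slot holding the final generation
}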
[5 collapsed file diffs omitted]
...@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry()
  INIT_FOR(kMLU, kInt16, kNCHW);

  INIT_FOR(kHost, kFloat, kNCHW);
+  INIT_FOR(kHost, kInt32, kNCHW);
+  INIT_FOR(kHost, kInt64, kNCHW);
  INIT_FOR(kHost, kAny, kNCHW);
  INIT_FOR(kHost, kFloat, kNHWC);
  INIT_FOR(kHost, kFloat, kAny);
......
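The two new INIT_FOR lines register host kernel buckets keyed by (target, precision, layout). Spelled with the public place macros, those keys are as follows (a sketch; the variable names are illustrative):

#include "lite/api/paddle_place.h"

const paddle::lite_api::Place kHostInt32{
    TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW)};
const paddle::lite_api::Place kHostInt64{
    TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kNCHW)};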
[114 collapsed file diffs omitted]