Unverified commit 7085da3b, authored by HappyAngel, committed by GitHub

Merge pull request #95 from PaddlePaddle/develop

pull code
......@@ -59,7 +59,9 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
......@@ -177,6 +179,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
......
......@@ -136,6 +136,9 @@ endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
add_definitions("-DLITE_WITH_XTCL")
endif()
endif()
if (LITE_WITH_OPENCL)
......@@ -150,6 +153,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......
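For context, these compile definitions surface in source files as preprocessor guards. A minimal sketch of how the new `LITE_WITH_MLU` and `LITE_WITH_XTCL` defines gate code paths — the helper function and messages are illustrative, not part of this patch:

#include <iostream>

// Hypothetical helper: shows the guard pattern enabled by add_definitions above.
void PrintEnabledBackends() {
#ifdef LITE_WITH_MLU
  std::cout << "MLU support compiled in\n";
#endif
#ifdef LITE_WITH_XTCL
  std::cout << "XPU XTCL path compiled in\n";
#endif
}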
......@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
......@@ -82,23 +50,55 @@ else()
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
if(LITE_WITH_XTCL)
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -100,6 +100,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_MLU)
foreach(var ${lite_deps_MLU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
......@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -427,7 +444,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -462,6 +479,7 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......
......@@ -20,7 +20,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml)
endif()
endif()
......@@ -67,7 +67,8 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
......@@ -89,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -126,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -145,6 +148,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -291,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -328,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -341,6 +347,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -353,6 +360,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -365,6 +373,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -377,6 +386,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
......@@ -388,6 +398,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -43,16 +43,7 @@ class LITE_API Predictor {
public:
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
~Predictor() {
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
scope_.reset();
exec_scope_ = nullptr;
program_.reset();
input_names_.clear();
output_names_.clear();
}
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
......
......@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
}
}
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......
......@@ -14,6 +14,7 @@
#include "lite/api/light_api.h"
#include <algorithm>
#include <unordered_map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
......@@ -135,7 +136,15 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
......@@ -151,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
......
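The OpenCL branch above follows an init-once/share-many pattern: a single KernelContext is initialized eagerly, and its shared state is copied into the context of every OpenCL kernel so they all reuse the same underlying cl::Context and command queue. A minimal sketch of the same pattern with stand-in types (these are not Paddle-Lite classes):

#include <memory>
#include <vector>

struct ProgramPool {};  // stand-in for the expensive per-process OpenCL state

class SharedCtx {
 public:
  void InitOnce() {
    if (!pool_) pool_ = std::make_shared<ProgramPool>();
  }
  void CopySharedTo(SharedCtx* other) { other->pool_ = pool_; }

 private:
  std::shared_ptr<ProgramPool> pool_;
};

int main() {
  SharedCtx master;
  master.InitOnce();  // heavyweight setup happens exactly once
  std::vector<SharedCtx> per_kernel(3);
  for (auto& ctx : per_kernel) master.CopySharedTo(&ctx);  // cheap shares
}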
......@@ -107,8 +107,6 @@ class LightPredictorImpl : public lite_api::PaddlePredictor {
public:
LightPredictorImpl() = default;
~LightPredictorImpl();
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
......
......@@ -21,13 +21,6 @@
namespace paddle {
namespace lite {
LightPredictorImpl::~LightPredictorImpl() {
raw_predictor_.reset();
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
}
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor Only support NaiveBuffer backend in publish lib
if (config.lite_model_file().empty()) {
......
......@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
......@@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) {
#endif
}
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
void CxxConfig::set_mlu_core_number(int core_number) {
mlu_core_number_ = core_number;
}
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
return mlu_core_version_;
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
"rebuild it with LITE_WITH_XPU=ON.";
#endif
}
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
#endif
}
// Set model data in combined format. `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads it from a memory
// buffer.
......
......@@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set the MLU input layout. Users can specify the layout of input data as
// NHWC; the default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first-conv kernel. First conv is a special kernel
// provided by MLU: its input is uint8, and it also needs two 3-dimensional
// vectors holding the mean and std values of all inputs
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimensional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimensional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
void set_xpu_dev_per_thread(int dev_no = 0);
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
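A minimal usage sketch of the new MLU options on CxxConfig, assuming a build with LITE_WITH_MLU=ON; the model path and the concrete option values are placeholders, not from the patch:

#include "lite/api/paddle_api.h"

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // placeholder path
#ifdef LITE_WITH_MLU
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(4);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});  // example values
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});      // example values
#endif
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  return 0;
}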
......@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
"fpga",
"npu",
"xpu",
"bm"};
"bm",
"mlu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
"kMLU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
......@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -53,8 +53,8 @@ enum class TargetType : int {
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
kMLU = 11,
kAny = 6, // any target
NUM = 12, // number of fields.
};
enum class PrecisionType : int {
......@@ -89,6 +89,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int {
kIndentity = 0,
kRelu = 1,
......@@ -100,7 +102,9 @@ enum class ActivationType : int {
kSwish = 7,
kExp = 8,
kAbs = 9,
NUM = 10,
kHardSwish = 10,
kReciprocal = 11,
NUM = 12,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
......@@ -45,5 +45,9 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
......@@ -47,6 +47,7 @@ using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
......@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m);
static void BindLiteMLUCoreVersion(py::module *m);
void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m);
......@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
BindLitePowerMode(m);
BindLitePlace(m);
BindLiteTensor(m);
BindLiteMLUCoreVersion(m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m);
#endif
......@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
#ifdef LITE_WITH_MLU
cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
.def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
.def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
.def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
.def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
.def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
......@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
}
void BindLiteMLUCoreVersion(py::module *m) {
py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
.value("LITE_MLU_220", MLUCoreVersion::MLU_220)
.value("LITE_MLU_270", MLUCoreVersion::MLU_270);
}
void BindLitePlace(py::module *m) {
// TargetType
py::enum_<TargetType>(*m, "TargetType")
......@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("Any", TargetType::kAny);
// PrecisionType
......@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
......
......@@ -6,4 +6,5 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/backends/arm/math/activation.h"
#include <algorithm>
#include <string>
#include "lite/backends/arm/math/funcs.h"
......@@ -711,6 +712,38 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_hard_swish<float>(const float* din,
float* dout,
int size,
float threshold,
float scale,
float offset,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
ptr_in[0] / scale;
ptr_in++;
ptr_out++;
}
}
template <>
void act_reciprocal<float>(const float* din,
float* dout,
int size,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = 1.0 / ptr_in[0];
ptr_in++;
ptr_out++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
......
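In formula form, the two scalar loops added above compute, per element x:

\[
\mathrm{hard\_swish}(x) = \frac{x \cdot \min\!\big(\max(0,\, x + \mathrm{offset}),\, \mathrm{threshold}\big)}{\mathrm{scale}},
\qquad
\mathrm{reciprocal}(x) = \frac{1}{x}.
\]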
......@@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
template <typename T>
void act_hard_swish(const T* din,
T* dout,
int size,
float threshold,
float scale,
float offset,
int threads);
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,7 +10,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_context.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
......@@ -36,10 +32,8 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
STL::stringstream program_key_ss;
program_key_ss << file_name << options;
std::string program_key = program_key_ss.str();
auto &programs = CLRuntime::Global()->programs();
auto it = programs.find(program_key);
if (it != programs.end()) {
auto it = programs_.find(program_key);
if (it != programs_.end()) {
VLOG(3) << " --- program -> " << program_key << " has been built --- ";
return *(it->second);
}
......@@ -50,9 +44,9 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
CLRuntime::Global()->BuildProgram(program.get(), options);
VLOG(3) << " --- end build program -> " << program_key << " --- ";
programs[program_key] = std::move(program);
programs_[program_key] = std::move(program);
return *(programs[program_key]);
return *(programs_[program_key]);
}
void CLContext::AddKernel(const std::string &kernel_name,
......@@ -68,30 +62,25 @@ void CLContext::AddKernel(const std::string &kernel_name,
new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_FATAL(status);
VLOG(3) << " --- end create kernel --- ";
auto &kernels = CLRuntime::Global()->kernels();
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
kernels.emplace_back(std::move(kernel));
kernels_.emplace_back(std::move(kernel));
STL::stringstream kernel_key;
kernel_key << kernel_name << options << time_stamp;
kernel_offset_map[kernel_key.str()] = kernels.size() - 1;
kernel_offset_[kernel_key.str()] = kernels_.size() - 1;
}
cl::Kernel &CLContext::GetKernel(const int index) {
auto &kernels = CLRuntime::Global()->kernels();
VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels.size())
VLOG(3) << " --- kernel count: " << kernels_.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels_.size())
<< "The index must be less than the size of kernels.";
CHECK(kernels[index] != nullptr)
CHECK(kernels_[index] != nullptr)
<< "The target kernel pointer cannot be null.";
return *(kernels[index]);
return *(kernels_[index]);
}
cl::Kernel &CLContext::GetKernel(const std::string &name) {
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
auto it = kernel_offset_map.find(name);
CHECK(it != kernel_offset_map.end()) << "Cannot find the kernel function: "
<< name;
auto it = kernel_offset_.find(name);
CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: "
<< name;
return GetKernel(it->second);
}
......
......@@ -27,6 +27,20 @@ namespace lite {
class CLContext {
public:
~CLContext() {
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto &p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
LOG(INFO) << "release cl::Program, cl::Kernel finished.";
}
cl::CommandQueue &GetCommandQueue();
cl::Context &GetContext();
......@@ -52,6 +66,10 @@ class CLContext {
int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size);
private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_;
std::map<std::string, int> kernel_offset_;
};
} // namespace lite
......
......@@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input,
__kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = 1 / (1 + exp(-in));
CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......
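The rewritten kernel body evaluates the logistic function component-wise, with pow(2.71828182, -x) standing in for exp(-x); in formula form it computes

\[
\sigma(x) = \frac{1}{1 + e^{-x}}, \qquad e \approx 2.71828182.
\]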
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -14,7 +11,6 @@ limitations under the License. */
#include "lite/backends/opencl/cl_runtime.h"
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/utils/cp_logging.h"
......@@ -29,38 +25,16 @@ CLRuntime* CLRuntime::Global() {
}
CLRuntime::~CLRuntime() {
LOG(INFO) << "CLRuntime::~CLRuntime()";
// Note: do ReleaseResources() in predictor
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
initialized_ = false;
}
void CLRuntime::ReleaseResources() {
// if (is_resources_released_) {
// return;
// }
if (command_queue_ != nullptr) {
command_queue_->flush();
command_queue_->finish();
}
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto& p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
LOG(INFO) << "release resources finished.";
is_resources_released_ = true;
// For controlling the destruction order:
command_queue_.reset();
context_.reset();
device_.reset();
platform_.reset();
LOG(INFO) << "release ~CLRuntime() ";
}
bool CLRuntime::Init() {
......@@ -98,14 +72,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
return *command_queue_;
}
std::shared_ptr<cl::Program> CLRuntime::CreateProgram(
std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
const cl::Context& context, std::string file_name) {
auto cl_file = opencl_kernels_files.find(file_name);
std::string content(cl_file->second.begin(), cl_file->second.end());
cl::Program::Sources sources;
sources.push_back(content);
auto prog =
std::shared_ptr<cl::Program>(new cl::Program(context, sources, &status_));
std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
VLOG(4) << "OpenCL kernel file name: " << file_name;
VLOG(4) << "Program source size: " << content.size();
CL_CHECK_FATAL(status_);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -18,7 +15,6 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
......@@ -33,8 +29,6 @@ class CLRuntime {
public:
static CLRuntime* Global();
void ReleaseResources();
bool Init();
cl::Platform& platform();
......@@ -45,7 +39,7 @@ class CLRuntime {
cl::CommandQueue& command_queue();
std::shared_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::string file_name);
std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
......@@ -60,12 +54,6 @@ class CLRuntime {
std::map<std::string, size_t>& GetDeviceInfo();
std::unordered_map<std::string, std::shared_ptr<cl::Program>>& programs() {
return programs_;
}
std::vector<std::unique_ptr<cl::Kernel>>& kernels() { return kernels_; }
std::map<std::string, int>& kernel_offset() { return kernel_offset_; }
private:
CLRuntime() = default;
......@@ -107,19 +95,11 @@ class CLRuntime {
std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
std::unordered_map<std::string, std::shared_ptr<cl::Program>> programs_{};
std::vector<std::unique_ptr<cl::Kernel>> kernels_{};
std::map<std::string, int> kernel_offset_{};
cl_int status_{CL_SUCCESS};
bool initialized_{false};
bool is_init_success_{false};
bool is_resources_released_{false};
};
} // namespace lite
......
......@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return()
endif()
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
......@@ -14,12 +14,12 @@
#pragma once
#include <xtcl/xtcl.h>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
// early check to avoid INF or big value get into convertor func.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
* Matrix Descriptor of a memory buffer.
*
* It is used for Blas::MatMul. The MatMul operator can be batched:
* if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it becomes
* `batch_size` GEMMs. The batched GEMM could be faster based on the
* implementation of the BLAS library. The batch size could be zero. If any
* matrix of `matmul` has a batch size, there will be a batched GEMM, too; e.g.,
* if Mat A is [BatchSize, H1, W1] and Mat B is [H2, W2] (with W1 == H2), the
* result matrix will be [BatchSize, H1, W2].
*
* The boolean flag, `trans`, describes whether the memory is the transpose of
* the matrix or not. If trans is true, the last two dims of the matrix are
* transposed. The memory layout of the matrix is [Width, Height] or
* [BatchSize, Width, Height].
*
* The MatDescriptor is not only the dimension or shape of a matrix, it also
* contains the layout, stride of matrix. It is clearer to have a structure than
* reuse `DDim`.
*/
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
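The helpers in this header compose into a simple symmetric int16 quantization pipeline: find the maximum absolute value, then convert with round-half-to-even and saturation. A minimal sketch (the include path is an assumption; it is not shown in the diff):

#include <vector>
// #include "lite/backends/xpu/math.h"  // assumed location of the helpers above

std::vector<int16_t> QuantizeToInt16(const std::vector<float>& w) {
  std::vector<int16_t> out(w.size());
  const float max_abs =
      paddle::lite::xpu::math::FindMaxAbs(w.data(), static_cast<int>(w.size()));
  paddle::lite::xpu::math::ConvertFP32ToInt16(
      w.data(), out.data(), max_abs, static_cast<int>(w.size()));
  return out;
}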
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
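A minimal host-to-device round-trip sketch for the XPU target wrapper above, assuming an XPU device and runtime are available at execution time:

#include <vector>
// #include "lite/backends/xpu/target_wrapper.h"  // header introduced in this diff

void RoundTrip() {
  using paddle::lite::IoDirection;
  using paddle::lite::TargetWrapperXPU;
  std::vector<float> host_in(1024, 1.0f), host_out(1024);
  void* dev = TargetWrapperXPU::Malloc(host_in.size() * sizeof(float));
  TargetWrapperXPU::MemcpySync(
      dev, host_in.data(), host_in.size() * sizeof(float), IoDirection::HtoD);
  TargetWrapperXPU::MemcpySync(
      host_out.data(), dev, host_out.size() * sizeof(float), IoDirection::DtoH);
  TargetWrapperXPU::Free(dev);
}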
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
......@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm)
BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -15,5 +15,11 @@
#include "lite/core/context.h"
namespace paddle {
namespace lite {} // namespace lite
namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle
......@@ -24,6 +24,14 @@
#include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h"
#endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
#include <map>
#include <memory>
......@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> {
public:
Context() {}
explicit Context(const XPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {}
static xdnn::Context* GetRawContext() {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
}
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
}
xpu_set_device(dev_no);
}
std::string name() const { return "XPUContext"; }
private:
static thread_local xdnn::Context* _tls_raw_ctx;
};
#endif
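The XPU context above keeps one raw xdnn::Context per thread, created lazily on first use; SetDev prefers the LITE_XPU_DEV environment variable over the per-thread argument. A generic sketch of the same thread-local lazy-init pattern, with stand-in types rather than the Paddle-Lite API:

#include <cassert>

struct RawCtx { int dev = 0; };
RawCtx* CreateRawCtx() { return new RawCtx; }  // stand-in for xdnn::create_context()

class TlsHolder {
 public:
  static RawCtx* Get() {
    if (tls_ctx_ == nullptr) {  // first touch on this thread creates the context
      tls_ctx_ = CreateRawCtx();
      assert(tls_ctx_ != nullptr);
    }
    return tls_ctx_;
  }

 private:
  static thread_local RawCtx* tls_ctx_;
};
thread_local RawCtx* TlsHolder::tls_ctx_{nullptr};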
......@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> {
};
#endif
#ifdef LITE_WITH_MLU
template <>
class Context<TargetType::kMLU> {
public:
typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or the current target does not exist!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
u32_t affinity() { return affinity_; }
cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
int device_id() { return device_id_; }
std::string name() const { return "MLUContext"; }
private:
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
std::vector<cnrtNotifier_t> input_notifiers_;
std::vector<cnrtNotifier_t> output_notifiers_;
cnrtInvokeFuncParam_t forward_param_;
u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
// Only works with CUDA kernels.
template <>
......@@ -398,6 +512,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif
default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
......@@ -439,6 +563,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif
}
......
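With the MLU branch above compiled in, a kernel's context is obtained from the scheduler and comes back bound to the current device. A hedged usage sketch:

#include <memory>

void AttachMluContext() {
  std::unique_ptr<paddle::lite::KernelContext> ctx =
      paddle::lite::ContextScheduler::Global().NewContext(TARGET(kMLU));
  // ctx->As<paddle::lite::MLUContext>() is now wired to the current MLU device.
}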
......@@ -58,7 +58,7 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
......@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
......@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
return 0;
}
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
......@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
void Device<TARGET(kMLU)>::GetInfo() {}
void Device<TARGET(kMLU)>::CreateQueue() {
exec_queue_.clear();
io_queue_.clear();
for (size_t i = 0; i < max_queue_; ++i) {
cnrtQueue_t exec_queue;
cnrtQueue_t io_queue;
cnrtCreateQueue(&exec_queue);
cnrtCreateQueue(&io_queue);
exec_queue_.push_back(exec_queue);
io_queue_.push_back(io_queue);
cnrtCreateQueue(&exec_queue);
exec_queue_.push_back(exec_queue);
}
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
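A minimal sketch of the MLU bring-up order implied by the hunks above, mirroring what CxxPaddleApiImpl::Init does: initialize the MLU environment (which calls cnrtInit and enumerates devices), then record the per-thread run mode. The concrete argument values are placeholders:

#ifdef LITE_WITH_MLU
void SetUpMlu() {
  paddle::lite::Env<TARGET(kMLU)>::Init();
  paddle::lite::DeviceInfo::Global().SetMLURunMode(
      paddle::lite_api::MLUCoreVersion::MLU_270,
      /*core_number=*/4,
      /*use_first_conv=*/false,
      /*mean_vec=*/{},
      /*std_vec=*/{},
      DATALAYOUT(kNCHW));
}
#endif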
......@@ -19,11 +19,14 @@
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
kAPPLE = 0,
......@@ -52,6 +55,20 @@ class DeviceInfo {
int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
......@@ -103,6 +120,15 @@ class DeviceInfo {
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...);
......@@ -134,6 +160,9 @@ class Env {
return *devs;
}
static void Init(int max_stream = 4) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global();
if (devs.size() > 0) {
return;
......@@ -156,6 +185,41 @@ class Env {
}
};
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
template <>
class Device<TARGET(kMLU)> {
public:
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
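A hedged sketch of bringing up one MLU device through the specialization above; this assumes a Device can be constructed directly, as its constructor suggests:

  paddle::lite::Env<TARGET(kMLU)>::Init();  // performs cnrtInit(0) once
  paddle::lite::Device<TARGET(kMLU)> dev(/* dev_id */ 0, /* max_queue */ 2);
  dev.Init();  // SetMluDevice(idx_), GetInfo(), CreateQueue()
  std::vector<cnrtQueue_t> exec_qs = dev.exec_queues();  // queues for kernel launches
  std::vector<cnrtQueue_t> io_qs = dev.io_queues();      // queues for memory copies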
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
......@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset();
#endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
......@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
data = TargetWrapperXPU::Malloc(size);
break;
#endif // LITE_WITH_XPU
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
......@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
TargetWrapper<TARGET(kBM)>::Free(data);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
TargetWrapperXPU::Free(data);
break;
#endif // LITE_WITH_XPU
default:
LOG(FATAL) << "Unknown type";
}
......@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break;
#endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
......@@ -31,6 +31,14 @@
#include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif // LITE_WITH_XPU
namespace paddle {
namespace lite {
......@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
TargetWrapperCL::MemcpySync(dst, src, size, dir);
break;
#endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_FPGA
case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
......
......@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
......@@ -35,6 +37,7 @@ lite_cc_library(mir_passes
demo_pass.cc
runtime_context_assign_pass.cc
memory_optimize_pass.cc
mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
......@@ -69,10 +72,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
if (WITH_TESTING)
list(APPEND pattern_deps gtest)
endif()
lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)
# for mobile, unnecessary to compile the following testings.
......
......@@ -27,8 +27,8 @@
#include "lite/utils/string.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace lite {
namespace mir {
static size_t dot_node_counter{0};
......@@ -162,6 +162,6 @@ class Dot {
std::vector<Attr> attrs_;
};
} // namespace analysis
} // namespace inference
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_interpolate
SRCS interpolate_fuser.cc
DEPS pattern_matcher_high_api)
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api)
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
......
(Two collapsed file diffs are not shown.)
......@@ -26,15 +26,13 @@ namespace paddle {
namespace lite {
namespace mir {
using inference::analysis::Dot;
void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(5) << "\n" << Visualize(graph.get());
}
std::string Visualize(mir::SSAGraph* graph) {
std::ostringstream os;
inference::analysis::Dot dot;
Dot dot;
auto string_trunc = [](const std::string& str) -> std::string {
const int max_disp_size = 100;
if (str.length() > max_disp_size)
......
......@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h"
#include <list>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") {
} else if (op_type == "layout") {
// NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name});
......@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
DataLayoutCompatible(*out_arg_ty, *cast_type) &&
// for first conv
PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
if (op_type == "layout") {
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break;
}
}
......@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>();
var->GetMutable<paddle::lite::Tensor>();
// create the stmt node
auto* cast_inst = graph->NewInstructNode();
......@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") {
} else if (op_type == "layout") {
// NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name});
......@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true;
}
} else if (op_type == "transpose") {
is_found = true;
} else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true;
}
} else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
......@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
// we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt();
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target()));
if (op_type == "layout") {
stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break;
}
}
......@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
auto* cur_node = head_node;
const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
bool is_first_conv_head =
std::find(first_conv_nodes_.begin(),
first_conv_nodes_.end(),
head_node->AsArg().name) != first_conv_nodes_.end();
// layout cast node
if (head_type->layout() != inst_type->layout()) {
// precision cast node
if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
cur_node = InsertCastBefore(
"transpose",
name_prefix + "transpose",
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout()));
head_type->target(), inst_type->precision(), head_type->layout()));
}
// precision cast node
if (head_type->precision() != inst_type->precision()) {
// layout cast node
if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore(
"cast",
name_prefix + "cast",
"layout",
name_prefix + "layout",
graph,
cur_node,
inst_node,
......@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph's valid precision
const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set;
std::set<paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) {
if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision);
......@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node
if (tail_type->layout() != inst_type->layout()) {
// precision cast node
if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter(
"transpose",
name_prefix + "transpose",
"cast",
name_prefix + "cast",
graph,
cur_node,
inst_node,
LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout()));
tail_type->target(), inst_type->precision(), tail_type->layout()));
}
// precision cast node
if (tail_type->precision() != inst_type->precision()) {
// layout cast node
if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter(
"cast",
name_prefix + "cast",
"layout",
name_prefix + "layout",
graph,
cur_node,
inst_node,
......@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
      /* graph like this
       *        subgraph_op_0
       *         /         \
       *        /           \
       * subgraph_op_1    host_op
       */
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
}
}
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
return true;
}
}
}
}
return false;
}
bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
CHECK(arg_node->IsArg());
for (auto& inst : arg_node->outlinks) {
if (inst->AsStmt().op_type() == "subgraph") {
return IsFirstConvInSubgraph(arg_node, inst);
}
}
return false;
}
void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
if (IsFirstConvNode(out)) {
first_conv_nodes_.insert(out->AsArg().name);
// modify first conv nodes' type
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
paddle::lite_api::PrecisionType::kInt8,
old_type->layout(),
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
......@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
......@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC,
paddle::lite_api::DataLayoutType::kNHWC,
old_type->device());
}
}
......@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
}
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // currently for non-persistent input and output args, the mlu subgraph op
  // only supports float16/float32 data types
  // in the two situations as follows:
  // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
  // arg_in and arg_out are assumed to be NHWC, which the user should be aware of.
  // Thus here we change these args' layout to NHWC
ModifyLayout(graph.get());
  // currently for non-persistent input and output args, the mlu subgraph op
  // only supports float16/float32 data types
  // in the two situations as follows:
  // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
  // arg_in and arg_out are assumed to be NHWC, which the user should be aware of.
  // Thus here we change these args' layout to NHWC
#ifdef LITE_WITH_MLU
if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
ModifyLayout(graph.get());
}
if (lite::DeviceInfo::Global().UseFirstConv()) {
GatherAndModifyFirstConvNodes(graph.get());
}
#endif
// insert io_copy, layout and precision cast of subgraph's inputs and outputs
for (auto& node : graph->mutable_nodes()) {
......
......@@ -15,6 +15,7 @@
#pragma once
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "lite/core/mir/pass.h"
......@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph);
void GatherAndModifyFirstConvNodes(SSAGraph* graph);
bool IsFirstConvNode(Node* arg_node);
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
private:
std::set<std::string> first_conv_nodes_;
};
} // namespace mir
......
......@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) {
}
std::string PMPattern::DotString() const {
using inference::analysis::Dot;
Dot dot;
int id = 0;
// Create Nodes
......
......@@ -64,7 +64,6 @@ class FuseBase {
protected:
virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
private:
void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate
......
......@@ -24,11 +24,31 @@ class RuntimeContextAssignPass : public StmtPass {
RuntimeContextAssignPass() {}
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
auto& inst = node.AsStmt();
#ifdef LITE_WITH_OPENCL
if (inst.picked_kernel().target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx)
.As<OpenCLContext>()
.CopySharedTo(&ctx->As<OpenCLContext>());
inst.picked_kernel().SetContext(std::move(ctx));
} else {
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target()));
}
#else
inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
#endif
}
}
};
......
......@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
return adj_list;
}
std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
std::map<mir::Node *, std::set<mir::Node *>> adj_list;
for (auto &n : mutable_nodes()) {
if (adj_list.find(&n) == adj_list.end()) {
adj_list[&n] = std::set<mir::Node *>();
}
std::vector<mir::Node *> nodes;
for (auto &var : n.inlinks) {
nodes.push_back(var);
}
std::sort(nodes.begin(),
nodes.end(),
[](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
void SSAGraph::SortHelper(
const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
......@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
return res;
}
std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
CheckBidirectionalConnection();
std::stack<mir::Node *> stack;
std::set<mir::Node *> visited;
std::vector<mir::Node *> res;
auto adj_list = BuildNodeAdjList();
for (auto adj : adj_list) {
if (visited.find(adj.first) == visited.end()) {
SortHelper(adj_list, adj.first, &visited, &res);
}
}
return res;
}
Node *SSAGraph::GraphCreateInstructNode(
const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
node_storage_.emplace_back();
......@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
}
mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
auto it = arguments_.find(arg);
if (it != arguments_.end()) {
return it->second;
for (auto &node : node_storage_) {
if (node.IsArg() && node.arg()->name == arg) {
return &node;
}
}
return nullptr;
}
......
......@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
std::vector<mir::Node *> StmtTopologicalOrder();
std::vector<mir::Node *> NodeTopologicalOrder();
// The inputs of the graph.
std::vector<mir::Node *> inputs();
......@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
// Build operator inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
// Build node inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node,
std::set<mir::Node *> *visited,
......
......@@ -30,10 +30,8 @@ namespace paddle {
namespace lite {
namespace mir {
using inference::analysis::Dot;
std::string SubgraphVisualizer::operator()() {
inference::analysis::Dot dot;
Dot dot;
const std::vector<std::string> subgraph_colors{
"red", "green", "cyan", "bisque3",
"coral", "darkseagreen1", "goldenrod1", "darkorchid",
......@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) {
for (auto &it : *nodes) {
node_dat_t *node = it.second;
for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
    // Different orders when traversing nodes in a graph may lead to different
    // subgraph divisions, which may generate different results on devices such
    // as MLU. These different results are all "right" but a little confusing.
    // Thus the topological order is used instead of the address of the node in
    // the graph.
CHECK(nodes->find(ordered_node) != nodes->end());
node_dat_t *node = (*nodes)[ordered_node];
if (!node->marked) {
continue;
}
......@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
unused_var_nodes->insert(var_node);
continue;
}
    // A var can have more than one next op node, so if any one of them is in
    // op_nodes then continue
    bool next_op_in_nodes = false;
    // A var can have more than one next op node, so it should be put into
    // local_var_nodes only if all of its next nodes are in op_nodes
    bool next_op_in_nodes = true;
for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) !=
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) ==
op_nodes.end()) {
next_op_in_nodes = true;
next_op_in_nodes = false;
break;
}
}
if (next_op_in_nodes) {
......
......@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
#ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
});
......
......@@ -20,6 +20,7 @@
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h"
#include "lite/utils/env.h"
namespace paddle {
namespace lite {
......@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
......@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
} // namespace mir
} // namespace lite
} // namespace paddle
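The re-defined USE_SUBGRAPH_BRIDGE macro above builds the teller's whitelist directly from the bridge registry header; as an illustration (bridge name hypothetical), a registration line such as

  USE_SUBGRAPH_BRIDGE(conv2d, kMLU)

expands inside Apply() to

  supported_lists.insert("conv2d");

so the SubgraphFuser only groups ops that actually have a registered MLU bridge.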
......@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
......@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_XPU
#ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
auto tar_predictor = TestModel(FLAGS_model_dir,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
......@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "picked, opencl found";
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
TargetCompatibleTo(*out_arg_ty, to)) {
VLOG(4) << "picked";
is_found = true;
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
#include "lite/core/op_lite.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUPatternMatcher::operator()(SSAGraph *graph,
XPUPatternMatcher::handle_t handler) {
if (!MarkPMNodesInGraph(graph)) {
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
  LOG(INFO) << "detected " << subgraphs.size() << " subgraphs";
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
VLOG(3) << "mark pmnodes in graph";
if (graph->nodes().empty()) return false;
for (auto &node : graph->mutable_nodes()) {
for (const auto &pmnode : pattern_.nodes()) {
if (pmnode->Tell(&node)) {
pmnodes2nodes_[pmnode.get()].insert(&node);
}
}
}
  // Check for early stop if some PMNode can't find a matched Node.
for (auto &pmnode : pattern_.nodes()) {
if (!pmnodes2nodes_.count(pmnode.get())) {
VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
// return false;
}
}
VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
return !pmnodes2nodes_.empty();
}
// The intermediate Nodes can only link to the nodes inside the pattern, or this
// subgraph will be dropped.
void XPUPatternMatcher::ValidateByNodeRole(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
subgraphs->erase(
std::remove_if(subgraphs->begin(),
subgraphs->end(),
[](const XPUPatternMatcher::subgraph_t &subgraph) -> bool {
// Collect the inlinks and outlinks.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->outlinks) {
if (!ios.count(x)) {
return true;
}
}
}
}
return false;
}),
subgraphs->end());
for (auto &subgraph : *subgraphs) {
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
extra_input_vars_.emplace_back();
for (auto &item : subgraph) {
for (auto *x : item.second->inlinks) {
if (x->IsArg() && ios.count(x) == 0) {
// extra weight var
extra_input_vars_.back().push_back(x);
}
}
}
}
}
struct HitGroup {
std::unordered_map<PMNode *, Node *> roles;
bool Match(Node *node, PMNode *pat) {
if (nodes_.count(node)) {
if (roles.count(pat) && roles[pat] == node) return true;
return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
}
}
void Register(Node *node, PMNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outlinks) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<PatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
// Init empty subgraphs.
std::vector<PatternMatcher::subgraph_t> result;
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pmnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pmnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
bi_records[0] = std::move(init_groups);
// Extend a PMNode to subgraphs by deducing the connection relations defined
// in edges of PMNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PMNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pmnodes2nodes_[edge.first]) {
for (Node *target : pmnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
HitGroup new_group = group;
bool flag = new_group.Match(source, edge.first) &&
new_group.Match(target, edge.second);
if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
}
for (auto &group : bi_records[step % 2]) {
XPUPatternMatcher::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
struct GraphItemLessThan {
bool operator()(const std::pair<PMNode *, Node *> &a,
const std::pair<PMNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
};
// TODO(Superjomn) enhance the function as it marks unique subgraphs as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void XPUPatternMatcher::UniquePatterns(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<PatternMatcher::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
STL::stringstream ss;
for (auto &item : sorted_keys) {
ss << reinterpret_cast<size_t>(item.first) << ":"
<< reinterpret_cast<size_t>(item.second);
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void XPUPatternMatcher::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
/*
* PatternMatcher helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
 * This helper can be used to support fusion (e.g. conv + batchnorm => a fused op).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PMNodes in a PMPattern,
* 2. Extend a PMNode to subgraphs by deducing the connection relation defined
 * in PMPattern (the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a matcher
* PatternMatcher matcher;
* // Define the matcher's pattern, by adding PMNode and define the edges.
* auto* node0 = matcher.mutable_pattern().AddNode(...)
* auto* node1 = matcher.mutable_pattern().AddNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* matcher.mutable_pattern().AddEdge(node0, node1);
 * // Create a handler, to define the behavior of treating the filtered
 * // subgraphs that comply with the patterns.
 * PatternMatcher::handle_t handler = some lambda
* // Execute the matcher.
* matcher(&graph, handler);
*/
struct XPUPatternMatcher {
using subgraph_t = std::unordered_map<PMNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*hitted pattern*/, SSAGraph*)>;
void operator()(SSAGraph* graph, handle_t handler);
const PMPattern& pattern() const { return pattern_; }
PMPattern* mutable_pattern() { return &pattern_; }
// Mark the nodes that fits the pattern.
bool MarkPMNodesInGraph(SSAGraph* graph);
// Detect all the pattern and output the hit records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
// The intermediate PMNodes will be removed, so they can't be shared by multiple
// patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes.
void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PMNode* /*node in pattern*/>;
PMPattern pattern_;
std::unordered_map<const PMNode*, std::unordered_set<Node*>> pmnodes2nodes_;
std::vector<std::vector<Node*>> extra_input_vars_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include <set>
#include <unordered_set>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) {
VLOG(4) << "\n" << matcher_.pattern().DotString();
// Get subgraphs and record the mir::Node pointers for each PMNode.
auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
    // get all the registered nodes.
key2nodes_.emplace_back();
for (auto &item : nodes_) {
key2nodes_.back()[item.first] = subgraph.at(item.second);
}
};
matcher_(graph, handler);
}
void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) {
std::set<std::string> keys;
for (auto &node : nodes_) {
if (node.second->IsIntermediate()) {
keys.insert(node.first);
}
}
  VLOG(4) << "matched subgraphs: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
nodes2rm.insert(matched.at(key));
}
}
VLOG(3) << "clean nodes " << nodes2rm.size();
GraphSafeRemoveNodes(graph, nodes2rm);
}
PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) {
auto it = nodes_.find(key);
if (it != nodes_.end()) {
return it->second;
}
nodes_.emplace(key,
matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
it = nodes_.find(key);
return it->second;
}
PMNode *XPUFuseBase::OpNode(const std::string &key,
const std::string &op_type) {
GetOrCreateNode(key)->set_op_type(op_type);
GetOrCreateNode(key)->AsOp(op_type);
return GetOrCreateNode(key);
}
PMNode *XPUFuseBase::VarNode(const std::string &key) {
GetOrCreateNode(key)->AsVar();
return GetOrCreateNode(key);
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
class XPUFuseBase {
public:
using key2nodes_t = std::map<std::string, Node*>;
virtual ~XPUFuseBase() = default;
void operator()(SSAGraph* graph) {
BuildPattern();
PerformPatternMatcher(graph);
for (size_t i = 0; i < key2nodes_.size(); ++i) {
InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]);
}
DeleteInterNodes(graph);
}
// Build a PMPattern using PMNode.
virtual void BuildPattern() = 0;
// Generate an operator desc with a matched subgraph.
virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
return cpp::OpDesc();
}
PMNode* OpNode(const std::string& key) {
return GetOrCreateNode(key)->assert_is_op();
}
PMNode* OpNode(const std::string& key, const std::string& op_type);
PMNode* VarNode(const std::string& key);
protected:
virtual void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) = 0;
void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate
void DeleteInterNodes(SSAGraph* graph);
PMNode* GetOrCreateNode(const std::string& key);
protected:
XPUPatternMatcher matcher_;
std::map<std::string, PMNode*> nodes_;
std::vector<key2nodes_t> key2nodes_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
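A minimal fuser sketch against the interface above, assuming PMPattern::AddEdge as documented in xpu_pattern_matcher.h; the pattern and op names are hypothetical:

  namespace plm = paddle::lite::mir;

  class DemoMulFuser : public plm::xpu::XPUFuseBase {
   public:
    void BuildPattern() override {
      auto* x = VarNode("x");
      auto* mul = OpNode("mul", "mul");  // match a "mul" stmt node
      auto* out = VarNode("out");
      matcher_.mutable_pattern()->AddEdge(x, mul);
      matcher_.mutable_pattern()->AddEdge(mul, out);
    }

   protected:
    void InsertNewNode(plm::SSAGraph* graph,
                       const key2nodes_t& matched,
                       const std::vector<plm::Node*>& extra_input_vars) override {
      // Create the fused op and relink the matched nodes here (omitted);
      // extra_input_vars carries the extra weight vars collected by
      // ValidateByNodeRole.
    }
  };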
......@@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
return var->GetMutable<lite::Tensor>();
}
void OpLite::AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var) {
bool is_have_input =
op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0;
CHECK(is_dispensable || is_have_input);
if (is_have_input) {
std::string input_var_name = op_desc.Input(input_name).front();
*input_var = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
}
}
void OpLite::AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var) {
bool is_have_output =
op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0;
CHECK(is_dispensable || is_have_output);
if (is_have_output) {
std::string output_var_name = op_desc.Output(output_name).front();
*output_var = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
}
}
} // namespace lite
} // namespace paddle
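A hedged sketch of how an operator's AttachImpl might use these helpers; MyOp and its param fields are hypothetical:

  bool MyOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
    // Required input/output: the CHECK inside the helper fires if absent.
    AttachInput(op_desc, scope, "X", /* is_dispensable */ false, &param_.x);
    AttachOutput(op_desc, scope, "Out", /* is_dispensable */ false, &param_.out);
    // Optional input: left untouched when the desc does not provide it.
    AttachInput(op_desc, scope, "Bias", /* is_dispensable */ true, &param_.bias);
    return true;
  }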
......@@ -105,6 +105,20 @@ class OpLite : public Registry {
return kernel_.get();
}
// Attach input variable from scope by op_desc and input name
void AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var);
// Attach output variable from scope by op_desc and output name
void AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var);
virtual ~OpLite() = default;
protected:
......
......@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kInt32, kNCHW);
INIT_FOR(kHost, kInt64, kNCHW);
INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC);
INIT_FOR(kHost, kFloat, kAny);
......
......@@ -135,6 +135,12 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
......
......@@ -75,6 +75,8 @@ class Optimizer {
(defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", //
#endif
"__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
......@@ -115,9 +117,15 @@ class Optimizer {
"variable_place_inference_pass", //
"argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass",
"argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}};
if (passes.size() == 1) {
passes_local.push_back(passes[0]);
}
......
......@@ -69,6 +69,13 @@ class WorkSpace {
}
#endif
#if defined(LITE_WITH_MLU)
static WorkSpace& Global_MLU() {
thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
return *x;
}
#endif
private:
explicit WorkSpace(TargetType x) : target_(x) {}
......
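Mirroring the KernelBase change earlier in this patch, an MLU kernel can drop its per-thread scratch allocations before each run; a one-line sketch:

  #if defined(LITE_WITH_MLU)
    paddle::lite::WorkSpace::Global_MLU().AllocReset();  // reset per-thread MLU workspace
  #endif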
......@@ -10,4 +10,5 @@ add_subdirectory(opencl)
add_subdirectory(fpga)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
......@@ -179,6 +179,34 @@ void SquareCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads());
}
void HardSwishCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
float threshold = param.hard_swish_threshold;
float scale = param.hard_swish_scale;
float offset = param.hard_swish_offset;
lite::arm::math::act_hard_swish<float>(x_data,
output_data,
x_dims.production(),
threshold,
scale,
offset,
ctx.threads());
}
void ReciprocalCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_reciprocal<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm
} // namespace kernels
} // namespace lite
......@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(hard_swish,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::HardSwishCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(reciprocal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ReciprocalCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
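For reference, a scalar sketch of what the two new ARM kernels compute, assuming Paddle's usual hard_swish definition (defaults threshold = 6, scale = 6, offset = 3); the lite::arm::math routines are the vectorized equivalents:

  #include <algorithm>

  // hard_swish(x) = x * clip(x + offset, 0, threshold) / scale   (assumed form)
  inline float hard_swish_ref(float x, float threshold, float scale, float offset) {
    return x * std::min(std::max(x + offset, 0.f), threshold) / scale;
  }

  // reciprocal(x) = 1 / x
  inline float reciprocal_ref(float x) { return 1.f / x; }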
......@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~SquareCompute() = default;
};
class HardSwishCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~HardSwishCompute() = default;
};
class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~ReciprocalCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
......@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps})
(Two collapsed file diffs are not shown.)
......@@ -6,3 +6,4 @@ add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
(A collapsed file diff is not shown.)
......@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph {
namespace mlu {
int ActConverter(void* ctx, OpLite* op);
template void FillTensor<float, int>(Tensor* x,
float lower = -2,
float upper = -2);
......@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh"};
std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
for (auto x_shape : shapes) {
for (auto op_type : types) {
test_act(x_shape, op_type);
......@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) {
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
sigmoid,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
USE_SUBGRAPH_BRIDGE(relu, kMLU)
USE_SUBGRAPH_BRIDGE(tanh, kMLU)
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
(The remaining file diffs are collapsed and not shown.)