Unverified commit d64d4b6f, authored by myq406450149 and committed by GitHub

Merge branch 'develop' into pass

@@ -59,7 +59,9 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
@@ -177,6 +179,10 @@ if(LITE_WITH_XPU)
include(device/xpu)
endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
...
@@ -136,6 +136,9 @@ endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
add_definitions("-DLITE_WITH_XTCL")
endif()
endif()
if (LITE_WITH_OPENCL)
@@ -150,6 +153,10 @@ if (LITE_WITH_BM)
add_definitions("-DLITE_WITH_BM")
endif()
if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
...
@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
@@ -82,23 +50,55 @@ else()
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
if(LITE_WITH_XTCL)
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
endif()
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_MLU)
foreach(var ${lite_deps_MLU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
@@ -285,12 +295,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -369,6 +379,12 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -409,6 +425,7 @@ function(add_kernel TARGET device level) ...@@ -409,6 +425,7 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
...@@ -427,7 +444,7 @@ endif() ...@@ -427,7 +444,7 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -462,6 +479,7 @@ function(add_operator TARGET level) ...@@ -462,6 +479,7 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
......
@@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
...
@@ -10,6 +10,7 @@ if (LITE_ON_TINY_PUBLISH)
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
@@ -66,7 +67,8 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -88,6 +90,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
@@ -125,7 +128,8 @@ lite_cc_library(light_api SRCS light_api.cc
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -144,6 +148,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -264,8 +269,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
@@ -292,6 +295,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -329,6 +333,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -342,6 +347,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -354,6 +360,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -366,6 +373,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
@@ -378,6 +386,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
@@ -389,6 +398,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
...
@@ -43,6 +43,16 @@ class LITE_API Predictor {
public:
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
~Predictor() {
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
scope_.reset();
exec_scope_ = nullptr;
program_.reset();
input_names_.clear();
output_names_.clear();
}
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
...
@@ -42,6 +42,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
}
}
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
config.mlu_core_number(),
config.mlu_use_first_conv(),
config.mlu_first_conv_mean(),
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
...
@@ -107,6 +107,8 @@ class LightPredictorImpl : public lite_api::PaddlePredictor {
public:
LightPredictorImpl() = default;
~LightPredictorImpl();
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
...
@@ -21,6 +21,13 @@
namespace paddle {
namespace lite {
LightPredictorImpl::~LightPredictorImpl() {
raw_predictor_.reset();
#ifdef LITE_WITH_OPENCL
CLRuntime::Global()->ReleaseResources();
#endif
}
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor Only support NaiveBuffer backend in publish lib
if (config.lite_model_file().empty()) {
...
@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
@@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) {
#endif
}
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
void CxxConfig::set_mlu_core_number(int core_number) {
mlu_core_number_ = core_number;
}
void CxxConfig::set_mlu_input_layout(DataLayoutType layout) {
mlu_input_layout_ = layout;
}
void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) {
mlu_use_first_conv_ = use_first_conv;
}
void CxxConfig::set_mlu_first_conv_mean(const std::vector<float> &mean) {
mlu_first_conv_mean_ = mean;
}
void CxxConfig::set_mlu_first_conv_std(const std::vector<float> &std) {
mlu_first_conv_std_ = std;
}
lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
return mlu_core_version_;
}
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
"rebuild it with LITE_WITH_XPU=ON.";
#endif
}
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
#endif
}
// set model data in combined format, `set_model_from_file` refers to loading
// model from file, set_model_from_buffer refers to loading model from memory
// buffer
...
@@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether use MLU's first conv kernel. First conv is a special kernel
// provided by MLU, its input is uint8, and also needs two 3-dimentional
// vectors which save all inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimentional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimentional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
void set_xpu_dev_per_thread(int dev_no = 0);
};
/// MobileConfig is the config for the light weight predictor, it will skip
...
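For orientation, a minimal usage sketch of the MLU options that this patch adds to CxxConfig (not part of the patch itself; the model path, mean/std values and the MLU_270/NHWC choices are illustrative assumptions):

#include <vector>
#include "lite/api/paddle_api.h"

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places({paddle::lite_api::Place{TARGET(kMLU), PRECISION(kFloat)},
                           paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  config.set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config.set_mlu_core_number(1);
  config.set_mlu_input_layout(DATALAYOUT(kNHWC));
  // First-conv mode takes uint8 input plus per-channel mean/std (3 values each);
  // the numbers below are placeholders, not recommended values.
  config.set_mlu_use_first_conv(true);
  config.set_mlu_first_conv_mean({124.0f, 117.0f, 104.0f});
  config.set_mlu_first_conv_std({59.0f, 57.0f, 57.0f});
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);
  return predictor ? 0 : 1;
}

When the library is built with LITE_WITH_MLU=ON, CxxPaddleApiImpl::Init (shown earlier in this diff) forwards these settings to DeviceInfo::Global().SetMLURunMode().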
@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
"fpga",
"npu",
"xpu",
"bm"};
"bm",
"mlu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA",
"kNPU",
"kXPU",
"kMLU",
"kBM"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kMLU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
...
@@ -53,8 +53,8 @@ enum class TargetType : int {
kNPU = 8,
kXPU = 9,
kBM = 10,
kAny = 6, // any target
kMLU = 11,
kAny = 6, // any target
NUM = 12, // number of fields.
};
enum class PrecisionType : int {
@@ -89,6 +89,8 @@ typedef enum {
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
enum class ActivationType : int {
kIndentity = 0,
kRelu = 1,
@@ -100,7 +102,9 @@ enum class ActivationType : int {
kSwish = 7,
kExp = 8,
kAbs = 9,
NUM = 10,
kHardSwish = 10,
kReciprocal = 11,
NUM = 12,
};
static size_t PrecisionTypeLength(PrecisionType type) {
...
@@ -45,6 +45,10 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(assign_value_eliminate_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
@@ -47,6 +47,7 @@ using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
@@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m);
static void BindLiteMLUCoreVersion(py::module *m);
void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m);
@@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) {
BindLitePowerMode(m);
BindLitePlace(m);
BindLiteTensor(m);
BindLiteMLUCoreVersion(m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m);
#endif
@@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) {
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
#ifdef LITE_WITH_MLU
cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version)
.def("set_mlu_core_number", &CxxConfig::set_mlu_core_number)
.def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout)
.def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv)
.def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean)
.def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) {
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
}
void BindLiteMLUCoreVersion(py::module *m) {
py::enum_<MLUCoreVersion>(*m, "MLUCoreVersion")
.value("LITE_MLU_220", MLUCoreVersion::MLU_220)
.value("LITE_MLU_270", MLUCoreVersion::MLU_270);
}
void BindLitePlace(py::module *m) {
// TargetType
py::enum_<TargetType>(*m, "TargetType")
@@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) {
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU)
.value("Any", TargetType::kAny);
// PrecisionType
@@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) {
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
#ifdef LITE_WITH_MLU
tensor.def("set_uint8_data",
[](Tensor &self,
const std::vector<uint8_t> &data,
TargetType type = TargetType::kHost) {
if (type == TargetType::kHost) {
self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
}
},
py::arg("data"),
py::arg("type") = TargetType::kHost);
DO_GETTER_ONCE(uint8_t, "uint8_data");
#endif
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
...
@@ -6,4 +6,5 @@ add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/backends/arm/math/activation.h"
#include <algorithm>
#include <string>
#include "lite/backends/arm/math/funcs.h"
@@ -711,6 +712,38 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_hard_swish<float>(const float* din,
float* dout,
int size,
float threshold,
float scale,
float offset,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
ptr_in[0] / scale;
ptr_in++;
ptr_out++;
}
}
template <>
void act_reciprocal<float>(const float* din,
float* dout,
int size,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = 1.0 / ptr_in[0];
ptr_in++;
ptr_out++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
...
@@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
template <typename T>
void act_hard_swish(const T* din,
T* dout,
int size,
float threshold,
float scale,
float offset,
int threads);
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
...
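The two new activations follow the scalar loops shown in activation.cc above. A standalone sketch of the same math (hypothetical helper functions, not the ARM math entry points; the threshold/scale/offset values are the commonly used hard-swish defaults, assumed here for illustration):

#include <algorithm>
#include <cstdio>

// Mirrors act_hard_swish<float>: min(max(0, x + offset), threshold) * x / scale.
float hard_swish(float x, float threshold, float scale, float offset) {
  return std::min(std::max(0.f, x + offset), threshold) * x / scale;
}

// Mirrors act_reciprocal<float>: 1 / x.
float reciprocal(float x) { return 1.0f / x; }

int main() {
  // With threshold = 6, scale = 6, offset = 3 this is x * relu6(x + 3) / 6.
  std::printf("%f\n", hard_swish(1.0f, 6.f, 6.f, 3.f));  // prints 0.666667
  std::printf("%f\n", reciprocal(4.0f));                  // prints 0.250000
  return 0;
}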
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void decode_center_size(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh % out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
CL_DTYPE4 prior_box_input[4];
CL_DTYPE4 prior_box_var_input[4];
CL_DTYPE4 target_box_input[4];
prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 0, prior_box_pos.y));
prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 1, prior_box_pos.y));
prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 2, prior_box_pos.y));
prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 3, prior_box_pos.y));
prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 1, target_box_pos.y));
target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 2, target_box_pos.y));
target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 3, target_box_pos.y));
CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
CL_DTYPE4 target_box_center_x;
CL_DTYPE4 target_box_center_y;
CL_DTYPE4 target_box_width;
CL_DTYPE4 target_box_height;
CL_DTYPE4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(half)2;
output[1].y = target_box_center_y.y - target_box_height.y/(half)2;
output[2].y = target_box_center_x.y + target_box_width.y/(half)2;
output[3].y = target_box_center_y.y + target_box_height.y/(half)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(half)2;
output[1].z = target_box_center_y.z - target_box_height.z/(half)2;
output[2].z = target_box_center_x.z + target_box_width.z/(half)2;
output[3].z = target_box_center_y.z + target_box_height.z/(half)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(half)2;
output[1].w = target_box_center_y.w - target_box_height.w/(half)2;
output[2].w = target_box_center_x.w + target_box_width.w/(half)2;
output[3].w = target_box_center_y.w + target_box_height.w/(half)2;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
@@ -29,30 +29,38 @@ CLRuntime* CLRuntime::Global() {
}
CLRuntime::~CLRuntime() {
LOG(INFO) << "CLRuntime::~CLRuntime()";
// Note: do ReleaseResources() in predictor
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
initialized_ = false;
}
void CLRuntime::ReleaseResources() {
// if (is_resources_released_) {
// return;
// }
if (command_queue_ != nullptr) {
command_queue_->flush();
command_queue_->finish();
}
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto& p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
LOG(INFO) << "release resources finished.";
// For controlling the destruction order
is_resources_released_ = true;
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
}
bool CLRuntime::Init() {
...
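The intent of the split above is that kernels, programs and the queue flush are now released through ReleaseResources() while a predictor still holds a live context, and the CLRuntime destructor that runs at process exit only drops the remaining queue/context/device handles. A generic, self-contained analogue of that pattern (illustrative only; these are not the Paddle-Lite classes):

#include <cstdio>

class Runtime {  // stand-in for a process-lifetime singleton such as CLRuntime
 public:
  static Runtime* Global() {
    static Runtime instance;
    return &instance;
  }
  void ReleaseResources() {
    if (released_) return;  // safe to call more than once
    std::printf("release kernels/programs while the device context is alive\n");
    released_ = true;
  }
  ~Runtime() {
    // Runs at exit, after every client is gone: only drop the raw handles here.
    std::printf("destructor: release remaining queue/context/device handles\n");
  }

 private:
  bool released_ = false;
};

int main() {
  // A predictor's destructor would call this explicitly, as ~Predictor() does above.
  Runtime::Global()->ReleaseResources();
  return 0;
}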
@@ -33,6 +33,8 @@ class CLRuntime {
public:
static CLRuntime* Global();
void ReleaseResources();
bool Init();
cl::Platform& platform();
@@ -116,6 +118,8 @@ class CLRuntime {
bool initialized_{false};
bool is_init_success_{false};
bool is_resources_released_{false};
};
} // namespace lite
...
@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// : nullptr;
// fill in data
std::vector<size_t> low_level;
std::vector<uint64_t> low_level;
size_t low_offset = 0;
uint64_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
...
@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* pre_scores) {
// lod
paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4});
std::vector<uint64_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4});
std::vector<uint64_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
...
@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
}
template <>
@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>(),
mat_out->template mutable_data<T>(),
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
dim_a.stride_,
dim_b.stride_);
...
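The `->template mutable_data<T>()` edits here and in the following files add the disambiguator that C++ requires when a member template is named through a dependent expression inside a template; presumably some of the newly supported build configurations tripped over it. A minimal standalone illustration of the rule (toy tensor type, not the Paddle-Lite Tensor):

// General dependent-name rule illustrated with a hypothetical tensor type.
struct ToyTensor {
  template <typename T>
  T* mutable_data() { return nullptr; }
};

template <typename TensorT, typename T>
T* RawBuffer(TensorT* t) {
  // 't' has a dependent type, so 'template' tells the compiler that
  // mutable_data names a member template; without it, '<' parses as less-than.
  return t->template mutable_data<T>();
}

int main() {
  ToyTensor tensor;
  return RawBuffer<ToyTensor, float>(&tensor) == nullptr ? 0 : 1;
}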
@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
// auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
auto output_data = output->mutable_data<T>();
auto output_data = output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len;
T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
// memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
// sizeof(T) * col_len);
...
@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
.reshape(batch_axis_remain)
.sum(Eigen::DSizes<int, 1>(1)));
} else {
const T* prob_data = prob->data<T>();
const T* prob_data = prob->template data<T>();
T* loss_data = out->mutable_data<T>();
T* loss_data = out->template mutable_data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
...
@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
int col_width = col->dims()[1];
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
...
...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, ...@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, ...@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
int output_width = col->dims()[4]; int output_width = col->dims()[4];
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width; int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width; size_t copy_size = sizeof(T) * output_width;
...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, ...@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
constexpr int prw = 1; constexpr int prw = 1;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
int im_size = im_height * im_width; int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height; int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
......
...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU { ...@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {} : tensor_(tensor), value_(value) {}
template <typename T> template <typename T>
void apply() const { void apply() const {
auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86); auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_)); std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
} }
lite::Tensor* tensor_; lite::Tensor* tensor_;
...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> { ...@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>(); const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] = output_data[i * in_dims[0] + j] =
......
...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> { ...@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> { ...@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->target()); T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) { for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......
...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> { ...@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
// c_size means the output size of each sample // c_size means the output size of each sample
int c_size = fea_size * output_channels; int c_size = fea_size * output_channels;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i; int new_bindex = c_size * i;
...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> { ...@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i; int blen = fea_size * output_channels * i;
......
...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int input_stride = input_height * input_width; const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend; int hstart, hend;
int wstart, wend; int wstart, wend;
...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> { ...@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> { ...@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend; int dstart, dend;
int hstart, hend; int hstart, hend;
...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> { ...@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -58,11 +58,11 @@ class SampleWithProb { ...@@ -58,11 +58,11 @@ class SampleWithProb {
const int64_t* label_data = L->data<int64_t>(); const int64_t* label_data = L->data<int64_t>();
// int64_t* samples_data = // int64_t* samples_data =
// S->mutable_data<int64_t>(ret_dim, Target); // S->mutable_data<int64_t>(ret_dim, Target);
// T* probabilities_data = P->mutable_data<T>(ret_dim, Target); // T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
S->Resize({batch_size, num_sampled_classes}); S->Resize({batch_size, num_sampled_classes});
auto* samples_data = S->mutable_data<int64_t>(Target); auto* samples_data = S->mutable_data<int64_t>(Target);
P->Resize({batch_size, num_sampled_classes}); P->Resize({batch_size, num_sampled_classes});
auto* probabilities_data = P->mutable_data<T>(Target); auto* probabilities_data = P->template mutable_data<T>(Target);
// temp sets for unique sampling // temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples; std::unordered_set<int64_t> tmp_samples;
......
...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> { ...@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size})); lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>(); const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86); auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>(); const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas, call_gemm<lite::X86Context, T>(blas,
......
...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> { ...@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->mutable_data<T>(); auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
std::copy_n(in1_data, in1_value.numel(), out_data); std::copy_n(in1_data, in1_value.numel(), out_data);
...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> { ...@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
functor(context, output, 0.0); functor(context, output, 0.0);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> { ...@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->mutable_data<T>(); auto* in2_data = in2_value->template mutable_data<T>();
std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
} }
}; };
...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> { ...@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
input2->set_rows(in2_rows); input2->set_rows(in2_rows);
auto* in2_value = input2->mutable_value(); auto* in2_value = input2->mutable_value();
T* in2_data = in2_value->mutable_data<T>(); T* in2_data = in2_value->template mutable_data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
size_t offset = 0u; size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) { for (size_t i = 0u; i != input1.size(); ++i) {
...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> { ...@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->mutable_data<T>(); auto* input2_data = input2->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) { for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) { for (int64_t j = 0; j < in1_row_numel; j++) {
...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> { ...@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>( lite::DDim dims(std::vector<int64_t>(
{static_cast<int64_t>(merged_row_set.size()), input_width})); {static_cast<int64_t>(merged_row_set.size()), input_width}));
out.mutable_value()->Resize(dims); out.mutable_value()->Resize(dims);
auto* out_data = out.mutable_value()->mutable_data<T>(); auto* out_data = out.mutable_value()->template mutable_data<T>();
if (merged_row_set.size() == row_num && !sorted_result) { if (merged_row_set.size() == row_num && !sorted_result) {
// no duplicated ids, just concat the result together // no duplicated ids, just concat the result together
...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> { ...@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->template data<T>();
// FIXME(typhoonzero): use macro fix the below messy code. // FIXME(typhoonzero): use macro fix the below messy code.
switch (op) { switch (op) {
......
...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public: public:
void operator()(const lite::Context<lite::TargetType::kX86>& context, void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index) { bool is_src_index) {
const size_t* index = index_lod.data(); const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims(); const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims(); const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> { ...@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
auto height = dst_dims[0]; auto height = dst_dims[0];
auto width = dst_dims[1]; auto width = dst_dims[1];
auto* src_data = src.data<T>(); auto* src_data = src.data<T>();
auto* dst_data = dst->mutable_data<T>(); auto* dst_data = dst->template mutable_data<T>();
const int sz = width * sizeof(T); const int sz = width * sizeof(T);
if (is_src_index) { if (is_src_index) {
for (int i = 0; i < height; ++i) { for (int i = 0; i < height; ++i) {
......
...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { ...@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context, void operator()(const lite::Context<Target>& context,
const lite::Tensor& src, const lite::Tensor& src,
const std::vector<size_t>& index_lod, const std::vector<uint64_t>& index_lod,
lite::Tensor* dst, lite::Tensor* dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { ...@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor {
// batch_lods[2] is the sort order for the input LoDTensor. // batch_lods[2] is the sort order for the input LoDTensor.
batch_lods->at(2).resize(seq_info.size()); batch_lods->at(2).resize(seq_info.size());
size_t* batch_starts = batch_lods->at(0).data(); auto* batch_starts = batch_lods->at(0).data();
size_t* seq2batch_idx = batch_lods->at(1).data(); auto* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) { for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]); auto batch_id = static_cast<int>(batch_starts[n]);
...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { ...@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor {
} }
batch_starts[n + 1] = static_cast<size_t>(batch_id); batch_starts[n + 1] = static_cast<size_t>(batch_id);
} }
size_t* seq_order = batch_lods->at(2).data(); auto* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) { for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx; seq_order[i] = seq_info[i].seq_idx;
} }
......
...@@ -22,15 +22,15 @@ namespace math { ...@@ -22,15 +22,15 @@ namespace math {
template <typename T> template <typename T>
void CopyValidData(lite::Tensor* dst_tensor, void CopyValidData(lite::Tensor* dst_tensor,
const lite::Tensor* src_tensor, const lite::Tensor* src_tensor,
const std::vector<size_t>& seq_offsets, const std::vector<uint64_t>& seq_offsets,
int pad_seq_len, int pad_seq_len,
int step_width, int step_width,
bool norm_by_len, bool norm_by_len,
CopyType type, CopyType type,
PadLayout layout) { PadLayout layout) {
int seq_num = seq_offsets.size() - 1; int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>(); const T* src_data = src_tensor->template data<T>();
T* dst_data = dst_tensor->mutable_data<T>(); T* dst_data = dst_tensor->template mutable_data<T>();
int seq_cpy_gap = step_width; int seq_cpy_gap = step_width;
int pad_cpy_gap = int pad_cpy_gap =
...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
"'step_width'."); "'step_width'.");
// fill padding value // fill padding value
T* pad_data = pad_tensor->mutable_data<T>(); T* pad_data = pad_tensor->template mutable_data<T>();
const T* pad_value_data = pad_value.data<T>(); const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) { if (pad_value.numel() == 1) {
fast_mem_init<T>( fast_mem_init<T>(
......
...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; ...@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq }; enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength( inline static uint64_t MaximumSequenceLength(
const std::vector<size_t>& seq_offset) { const std::vector<uint64_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1; uint64_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0; uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) { for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
} }
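// Illustrative example (not part of the patch): seq_offset = {0, 2, 7, 10}
// gives sequence lengths {2, 5, 3}, so MaximumSequenceLength returns 5.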
...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( ...@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength(
inline static void CheckDims(const lite::DDim& seq_tensor_dims, inline static void CheckDims(const lite::DDim& seq_tensor_dims,
const lite::DDim& pad_tensor_dims, const lite::DDim& pad_tensor_dims,
const std::vector<size_t>& seq_offset, const std::vector<uint64_t>& seq_offset,
int64_t padded_seq_len, int64_t padded_seq_len,
int64_t step_width, int64_t step_width,
const PadLayout& layout) { const PadLayout& layout) {
......
...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { ...@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int* max_index = index->mutable_data<int>(); int* max_index = index->mutable_data<int>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> {
auto starts = input.lod()[0]; auto starts = input.lod()[0];
const T* in_data = input.data<T>(); const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>(); T* out_data = output->template mutable_data<T>();
int64_t num_seq = out_dims[0]; int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq; int64_t dim = output->numel() / num_seq;
...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { ...@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor {
const T* og_data = out_grad.data<T>(); const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>(); const int* max_index = index.data<int>();
T* ig_data = in_grad->mutable_data<T>(); T* ig_data = in_grad->template mutable_data<T>();
SetConstant<TARGET(kX86), T> set_zero; SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0)); set_zero(context, in_grad, static_cast<T>(0.0));
...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor { ...@@ -170,7 +170,7 @@ class LastSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { ...@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor {
lite::Tensor* output) { lite::Tensor* output) {
// Create pointers to input and output data // Create pointers to input and output data
auto* in_data = input.data<T>(); auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence // Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0]; int64_t item_size = input.numel() / input.dims()[0];
...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { ...@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor {
int64_t in_w = in_grad->numel() / in_grad->dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w); PADDLE_ENFORCE(in_w == out_w);
const T* out_g_data = out_grad.data<T>(); const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->mutable_data<T>(TARGET(kX86)); T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context); auto blas = math::GetBlas<TARGET(kX86), T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]); int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> { ...@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
auto lod = input.lod()[0]; auto lod = input.lod()[0];
if (pooltype == "SUM") { if (pooltype == "SUM") {
const T* src = input.data<T>(); const T* src = input.data<T>();
T* dst = output->mutable_data<T>(TARGET(kX86)); T* dst = output->template mutable_data<T>(TARGET(kX86));
jit::seq_pool_attr_t attr( jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]), static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum); jit::SeqPoolType::kSum);
......
...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { ...@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
TEST(SequencePoolingGrad, CPU_SUM) { TEST(SequencePoolingGrad, CPU_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext, TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, paddle::platform::CPUPlace,
float>(lod2); float>(lod2);
...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { ...@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(SequencePoolingGrad, CUDA_SUM) { TEST(SequencePoolingGrad, CUDA_SUM) {
paddle::framework::LoD lod1; paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10}); lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod1); float>(lod1);
paddle::framework::LoD lod2; paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10}); lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext, TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, paddle::platform::CUDAPlace,
float>(lod2); float>(lod2);
......
...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> { ...@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> {
size_t seq_width = seq->dims()[1]; size_t seq_width = seq->dims()[1];
lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(lite::TargetType::kX86); T* seq_data = seq->template mutable_data<T>(lite::TargetType::kX86);
for (size_t i = 0; i < num_seq; ++i) { for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width; for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width; j < lod[level][i + 1] * seq_width;
......
...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86); auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0; int offset = 0;
std::vector<size_t> vec_out_lod; std::vector<uint64_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1); vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) { for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i]; offset = row_lod[i];
...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> { ...@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
out->set_lod(lod_temp); out->set_lod(lod_temp);
auto in_data = in.data<T>(); auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86); auto out_data = out->template mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k]; T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
......
...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> { ...@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* in_data = X->data<T>(); const T* in_data = X->template data<T>();
auto* out_data = Y->mutable_data<T>(); auto* out_data = Y->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes); T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1); max_val *= static_cast<T>(-1);
...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> { ...@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim; const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* out_data = y->data<T>(); const T* out_data = y->template data<T>();
const T* out_grad = y_grad->data<T>(); const T* out_grad = y_grad->template data<T>();
T* in_grad = x_grad->mutable_data<T>(); T* in_grad = x_grad->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) { for (int bs = 0; bs < batch_size; ++bs) {
T scalar; T scalar;
vec_mul_reduce<T, lite::x86::avx>( vec_mul_reduce<T, lite::x86::avx>(
......
...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> { ...@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
patch_size = processing_list.size(); patch_size = processing_list.size();
// T *patch_data = // T *patch_data =
// patch->mutable_data<T>({static_cast<int64_t>(patch_size), // patch->template mutable_data<T>({static_cast<int64_t>(patch_size),
// static_cast<int64_t>(patch_elem_size)}, // static_cast<int64_t>(patch_elem_size)},
// cpu_place); // cpu_place);
patch->Resize({static_cast<int64_t>(patch_size), patch->Resize({static_cast<int64_t>(patch_size),
static_cast<int64_t>(patch_elem_size)}); static_cast<int64_t>(patch_elem_size)});
auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86); auto *patch_data = patch->template mutable_data<T>(lite::TargetType::kX86);
constant(context, patch, 0); constant(context, patch, 0);
const T *features = node_features.data<T>(); const T *features = node_features.data<T>();
...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> { ...@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> {
} }
} }
// T *grad_data = // T *grad_data =
// in_grad->mutable_data<T>({static_cast<int64_t>(node_count), // in_grad->template mutable_data<T>({static_cast<int64_t>(node_count),
// static_cast<int64_t>(grad_elem_size)}, // static_cast<int64_t>(grad_elem_size)},
// cpu_place); // cpu_place);
in_grad->Resize({static_cast<int64_t>(node_count), in_grad->Resize({static_cast<int64_t>(node_count),
static_cast<int64_t>(grad_elem_size)}); static_cast<int64_t>(grad_elem_size)});
auto *grad_data = in_grad->mutable_data<T>(lite::TargetType::kX86); auto *grad_data = in_grad->template mutable_data<T>(lite::TargetType::kX86);
constant(context, in_grad, 0); constant(context, in_grad, 0);
const T *out_g = out_grad.data<T>(); const T *out_g = out_grad.data<T>();
......
...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> { ...@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86); T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) { for (int i = 0; i < input_feasize; ++i) {
...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> { ...@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width; int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>(); const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86); T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) { for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> { ...@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
"mismatching."); "mismatching.");
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->mutable_data<T>(); T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> { ...@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
output_width, output_width,
"input_width and output_width are " "input_width and output_width are "
"mismatching."); "mismatching.");
T* vol_data = vol->mutable_data<T>(); T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
......
...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) ...@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return() return()
endif() endif()
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
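// Illustrative values (not part of this file): round_half_to_even(2.5f) == 2
// and round_half_to_even(3.5f) == 4, while round_half_to_even(2.3f) == 2;
// ties are broken toward the nearest even integer (banker's rounding).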
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
  // early check to avoid INF or big values getting into the converter function.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
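// Illustrative sketch only (not part of this file): a typical int16
// quantization path chains the two helpers above; `weights` and `len` are
// hypothetical caller-owned values.
//
//   float max_abs = FindMaxAbs(weights, len);
//   std::vector<int16_t> quantized(len);
//   ConvertFP32ToInt16(weights, quantized.data(), max_abs, len);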
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
* Matrix Descriptor of a memory buffer.
*
 * It is used for Blas::MatMul. The MatMul operator can be batched:
 * if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it runs
 * `batch_size` GEMMs. The batched GEMM could be faster based on the
 * implementation of the BLAS library. The batch size could be zero. If any
 * matrix of `matmul` has a batch size, there will be a batched GEMM, too,
 * e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [H2, W2], the result
 * matrix will be [BatchSize, H1, W2].
 *
 * The boolean flag, `trans`, describes whether the memory is the transpose of
 * the matrix or not. If `trans` is true, the last two dims of the matrix are
 * transposed, and the memory layout of the matrix is [Width, Height] or
 * [BatchSize, Width, Height].
 *
 * The MatDescriptor is not only the dimension or shape of a matrix, it also
 * contains the layout and stride of the matrix. It is clearer to have a
 * structure than to reuse `DDim`.
*/
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
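// Illustrative example (not part of this file): for a tensor of shape
// [4, 8, 16] with num_flatten_cols == 1 and trans == false, the descriptor
// comes out as batch_size_ = 4, height_ = 8, width_ = 16, stride_ = 128.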
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
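// Illustrative sketch only (not part of this file): round-tripping a host
// buffer through XPU device memory with the wrapper declared above;
// `host_src`, `host_dst` and `size` are hypothetical caller-owned values.
//
//   void* dev = paddle::lite::TargetWrapperXPU::Malloc(size);
//   paddle::lite::TargetWrapperXPU::MemcpySync(dev, host_src, size,
//                                              paddle::lite::IoDirection::HtoD);
//   paddle::lite::TargetWrapperXPU::MemcpySync(host_dst, dev, size,
//                                              paddle::lite::IoDirection::DtoH);
//   paddle::lite::TargetWrapperXPU::Free(dev);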
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc ...@@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86 X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm) BM_DEPS target_wrapper_bm
MLU_DEPS target_wrapper_mlu)
lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -15,5 +15,11 @@ ...@@ -15,5 +15,11 @@
#include "lite/core/context.h" #include "lite/core/context.h"
namespace paddle { namespace paddle {
namespace lite {} // namespace lite namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -24,6 +24,14 @@ ...@@ -24,6 +24,14 @@
#include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#endif #endif
#ifdef LITE_WITH_MLU
#include <cnml.h>
#include <cnrt.h>
#include "lite/backends/mlu/mlu_utils.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/xpu_header_sitter.h"
#endif
#include <map> #include <map>
#include <memory> #include <memory>
...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> { ...@@ -103,11 +111,38 @@ class Context<TargetType::kXPU> {
public: public:
Context() {} Context() {}
explicit Context(const XPUContext& ctx); explicit Context(const XPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {} void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {} void CopySharedTo(XPUContext* ctx) {}
static xdnn::Context* GetRawContext() {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
}
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
}
xpu_set_device(dev_no);
}
std::string name() const { return "XPUContext"; } std::string name() const { return "XPUContext"; }
private:
static thread_local xdnn::Context* _tls_raw_ctx;
}; };
#endif #endif
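// Illustrative sketch only (assumption, not part of this patch): kernels can
// share the lazily-created, thread-local xdnn context introduced above.
//
//   xdnn::Context* raw = paddle::lite::XPUContext::GetRawContext();
//   paddle::lite::XPUContext::SetWorkspaceL3Size();  // defaults to 0xfffc00
//   paddle::lite::XPUContext::SetDev(0);             // or set LITE_XPU_DEV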
...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> { ...@@ -172,6 +207,85 @@ class Context<TargetType::kFPGA> {
}; };
#endif #endif
#ifdef LITE_WITH_MLU
template <>
class Context<TargetType::kMLU> {
public:
typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
void InitOnce() {}
MLUContext& operator=(const MLUContext& ctx) {
this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
return *this;
}
void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
SetMluDevice(device_id_);
if (io_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "data queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
io_queue_id = 0;
}
if (exec_queue_id >= devs[dev_id].max_queue()) {
LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
"set to default qeueu(0)!";
exec_queue_id = 0;
}
io_queue_ = devs[dev_id].io_queues()[io_queue_id];
exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
exec_queue_id_ = exec_queue_id;
io_queue_id_ = io_queue_id;
}
void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
const cnrtQueue_t& exec_queue() const { return exec_queue_; }
void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
const cnrtQueue_t& io_queue() const { return io_queue_; }
void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
cnmlCoreVersion_t MLUCoreVersion() {
return DeviceInfo::Global().MLUCoreVersion();
}
int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
u32_t affinity() { return affinity_; }
cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
int device_id() { return device_id_; }
std::string name() const { return "MLUContext"; }
private:
int device_id_;
// overall information
int exec_queue_id_;
int io_queue_id_;
cnrtQueue_t io_queue_;
cnrtQueue_t exec_queue_;
std::vector<cnrtNotifier_t> input_notifiers_;
std::vector<cnrtNotifier_t> output_notifiers_;
cnrtInvokeFuncParam_t forward_param_;
u32_t affinity_ = 0x01;
};
#endif // LITE_WITH_MLU
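// Illustrative sketch only (assumption, not part of this patch): binding the
// new MLU context to device 0 and fetching its execution queue.
//
//   paddle::lite::MLUContext mlu_ctx;
//   mlu_ctx.Init(/*dev_id=*/0);               // default exec/io queue ids
//   cnrtQueue_t queue = mlu_ctx.exec_queue();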
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
// Only works with CUDA kernels. // Only works with CUDA kernels.
template <> template <>
...@@ -398,6 +512,16 @@ class ContextScheduler { ...@@ -398,6 +512,16 @@ class ContextScheduler {
kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo( kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
&ctx->As<BMContext>()); &ctx->As<BMContext>());
break; break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
auto& context = ctx->As<MLUContext>();
context.Init(dev_id);
kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
&context);
LOG(INFO) << "New Context for MLU";
} break;
#endif #endif
default: default:
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
...@@ -439,6 +563,9 @@ class ContextScheduler { ...@@ -439,6 +563,9 @@ class ContextScheduler {
#endif #endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
InitContext<TargetType::kBM, BMContext>(); InitContext<TargetType::kBM, BMContext>();
#endif
#ifdef LITE_WITH_MLU
InitContext<TargetType::kMLU, MLUContext>();
#endif #endif
} }
......
...@@ -58,7 +58,7 @@ ...@@ -58,7 +58,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_; thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_; thread_local int DeviceInfo::mem_size_;
...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_; ...@@ -66,6 +66,15 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_; thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0; thread_local int64_t DeviceInfo::count_ = 0;
#ifdef LITE_WITH_MLU
thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
thread_local int DeviceInfo::mlu_core_number_{1};
thread_local bool DeviceInfo::use_first_conv_{false};
thread_local std::vector<float> DeviceInfo::mean_vec_;
thread_local std::vector<float> DeviceInfo::std_vec_;
thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)};
#endif
#ifdef TARGET_IOS #ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { ...@@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() {
return 0; return 0;
} }
#ifdef LITE_WITH_MLU
void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout) {
switch (core_version) {
case (lite_api::MLUCoreVersion::MLU_220):
mlu_core_version_ = CNML_MLU220;
break;
case (lite_api::MLUCoreVersion::MLU_270):
mlu_core_version_ = CNML_MLU270;
break;
default:
mlu_core_version_ = CNML_MLU270;
break;
}
mlu_core_number_ = core_number;
use_first_conv_ = use_first_conv;
mean_vec_ = mean_vec;
std_vec_ = std_vec;
input_layout_ = input_layout;
}
cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
bool DeviceInfo::UseFirstConv() { return use_first_conv_; }
const std::vector<float>& DeviceInfo::MeanVec() const { return mean_vec_; }
const std::vector<float>& DeviceInfo::StdVec() const { return std_vec_; }
DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; }
#endif // LITE_WITH_MLU
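// Illustrative sketch only (assumption, not part of this patch): configuring
// the per-thread MLU run mode; the mean/std values below are placeholders.
//
//   paddle::lite::DeviceInfo::Global().SetMLURunMode(
//       lite_api::MLUCoreVersion::MLU_270, /*core_number=*/4,
//       /*use_first_conv=*/true,
//       /*mean_vec=*/{0.485f, 0.456f, 0.406f},
//       /*std_vec=*/{0.229f, 0.224f, 0.225f},
//       /*input_layout=*/DATALAYOUT(kNCHW));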
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP #ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_); thread_num = std::min(thread_num, core_num_);
...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { ...@@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id) {
LOG(INFO) << "Set mlu device " << device_id;
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
}
void Device<TARGET(kMLU)>::Init() {
SetMluDevice(idx_);
GetInfo();
CreateQueue();
}
void Device<TARGET(kMLU)>::GetInfo() {}
void Device<TARGET(kMLU)>::CreateQueue() {
exec_queue_.clear();
io_queue_.clear();
for (size_t i = 0; i < max_queue_; ++i) {
cnrtQueue_t exec_queue;
cnrtQueue_t io_queue;
cnrtCreateQueue(&exec_queue);
cnrtCreateQueue(&io_queue);
exec_queue_.push_back(exec_queue);
io_queue_.push_back(io_queue);
cnrtCreateQueue(&exec_queue);
exec_queue_.push_back(exec_queue);
}
}
#endif // LITE_WITH_MLU
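For reference, a standalone sketch assembled only from the CNRT calls used above (device selection plus one queue); error handling beyond CNRT_CALL is omitted:

// Hypothetical sketch: bind the calling thread to MLU device 0 and create
// one execution queue, mirroring SetMluDevice() and CreateQueue() above.
cnrtInit(0);
cnrtDev_t dev_handle;
CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
cnrtQueue_t queue;
cnrtCreateQueue(&queue);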
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() { void Device<TARGET(kCUDA)>::Init() {
......
...@@ -19,11 +19,14 @@ ...@@ -19,11 +19,14 @@
#include <vector> #include <vector>
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/mlu_utils.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum { typedef enum {
kAPPLE = 0, kAPPLE = 0,
...@@ -52,6 +55,20 @@ class DeviceInfo { ...@@ -52,6 +55,20 @@ class DeviceInfo {
int Setup(); int Setup();
void SetRunMode(lite_api::PowerMode mode, int thread_num); void SetRunMode(lite_api::PowerMode mode, int thread_num);
#ifdef LITE_WITH_MLU
void SetMLURunMode(lite_api::MLUCoreVersion core_version,
int core_number,
bool use_first_conv,
const std::vector<float>& mean_vec,
const std::vector<float>& std_vec,
DataLayoutType input_layout);
cnmlCoreVersion_t MLUCoreVersion();
int MLUCoreNumber();
bool UseFirstConv();
const std::vector<float>& MeanVec() const;
const std::vector<float>& StdVec() const;
DataLayoutType InputLayout() const;
#endif
void SetCache(int l1size, int l2size, int l3size); void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; } void SetArch(ARMArch arch) { arch_ = arch; }
...@@ -103,6 +120,15 @@ class DeviceInfo { ...@@ -103,6 +120,15 @@ class DeviceInfo {
static thread_local TensorLite workspace_; static thread_local TensorLite workspace_;
static thread_local int64_t count_; static thread_local int64_t count_;
#ifdef LITE_WITH_MLU
static thread_local cnmlCoreVersion_t mlu_core_version_;
static thread_local int mlu_core_number_;
static thread_local bool use_first_conv_;
static thread_local std::vector<float> mean_vec_;
static thread_local std::vector<float> std_vec_;
static thread_local DataLayoutType input_layout_;
#endif
void SetDotInfo(int argc, ...); void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...); void SetFP16Info(int argc, ...);
void SetFP32Info(int argc, ...); void SetFP32Info(int argc, ...);
...@@ -134,6 +160,9 @@ class Env { ...@@ -134,6 +160,9 @@ class Env {
return *devs; return *devs;
} }
static void Init(int max_stream = 4) { static void Init(int max_stream = 4) {
#ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0));
#endif
Devs& devs = Global(); Devs& devs = Global();
if (devs.size() > 0) { if (devs.size() > 0) {
return; return;
...@@ -156,6 +185,41 @@ class Env { ...@@ -156,6 +185,41 @@ class Env {
} }
}; };
#ifdef LITE_WITH_MLU
void SetMluDevice(int device_id);
template <>
class Device<TARGET(kMLU)> {
public:
Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
void Init();
int id() { return idx_; }
int max_queue() { return max_queue_; }
void SetId(int idx) { idx_ = idx; }
std::string name() { return "MLU"; }
int core_num() { return 16; }
float max_memory() { return 16 * 1024; }
std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
private:
void CreateQueue();
void GetInfo();
private:
int idx_{0};
int max_queue_;
std::string device_name_;
float max_memory_;
std::vector<cnrtQueue_t> io_queue_;
std::vector<cnrtQueue_t> exec_queue_;
};
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
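A hedged usage sketch of the new specialization: Env<kMLU>::Init() calls cnrtInit(0) once and then fills the global device table, after which each Device<kMLU> exposes its own exec/io queue vectors. The Global() accessor and the element type of Devs are assumptions based on the template above:

// Hypothetical sketch, assuming Env<Target>::Global() returns the device
// list populated by Init().
paddle::lite::Env<TARGET(kMLU)>::Init(/*max_stream=*/1);
auto& mlu_devs = paddle::lite::Env<TARGET(kMLU)>::Global();
if (!mlu_devs.empty()) {
  auto queues = mlu_devs[0].exec_queues();  // one cnrtQueue_t per stream
}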
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
template <> template <>
class Device<TARGET(kCUDA)> { class Device<TARGET(kCUDA)> {
......
...@@ -83,6 +83,9 @@ class KernelBase { ...@@ -83,6 +83,9 @@ class KernelBase {
#if defined(LITE_WITH_CUDA) #if defined(LITE_WITH_CUDA)
WorkSpace::Global_CUDA().AllocReset(); WorkSpace::Global_CUDA().AllocReset();
#endif #endif
#if defined(LITE_WITH_MLU)
WorkSpace::Global_MLU().AllocReset();
#endif
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
......
...@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) { ...@@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) {
data = TargetWrapper<TARGET(kBM)>::Malloc(size); data = TargetWrapper<TARGET(kBM)>::Malloc(size);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
data = TargetWrapperXPU::Malloc(size);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target); LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
} }
...@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { ...@@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) {
TargetWrapper<TARGET(kBM)>::Free(data); TargetWrapper<TARGET(kBM)>::Free(data);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::Free(data);
break;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
case TargetType::kXPU:
TargetWrapperXPU::Free(data);
break;
#endif // LITE_WITH_XPU
default: default:
LOG(FATAL) << "Unknown type"; LOG(FATAL) << "Unknown type";
} }
...@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { ...@@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
break; break;
#endif #endif
#ifdef LITE_WITH_MLU
case TargetType::kMLU:
TargetWrapper<TARGET(kMLU)>::MemcpySync(
dst, src, size, IoDirection::HtoD);
break;
#endif
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
case TargetType::kOpenCL: case TargetType::kOpenCL:
TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
......
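A hypothetical round trip through the helpers patched above; note that the MLU branch of TargetCopy() issues a host-to-device copy:

// Hypothetical sketch: allocate on the MLU, copy a host buffer in, then free.
const size_t bytes = 1024;
void* dev_buf = paddle::lite::TargetMalloc(TARGET(kMLU), bytes);
std::vector<char> host(bytes, 0);
paddle::lite::TargetCopy(TARGET(kMLU), dev_buf, host.data(), bytes);
paddle::lite::TargetFree(TARGET(kMLU), dev_buf);  // free_flag left at its assumed default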
...@@ -31,6 +31,14 @@ ...@@ -31,6 +31,14 @@
#include "lite/backends/bm/target_wrapper.h" #include "lite/backends/bm/target_wrapper.h"
#endif // LITE_WITH_BM #endif // LITE_WITH_BM
#ifdef LITE_WITH_MLU
#include "lite/backends/mlu/target_wrapper.h"
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/target_wrapper.h"
#endif // LITE_WITH_XPU
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { ...@@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
TargetWrapperCL::MemcpySync(dst, src, size, dir); TargetWrapperCL::MemcpySync(dst, src, size, dir);
break; break;
#endif // LITE_WITH_OPENCL #endif // LITE_WITH_OPENCL
#ifdef LITE_WITH_MLU
case TARGET(kMLU):
TargetWrapperMlu::MemcpySync(dst, src, size, dir);
break;
#endif
#ifdef LITE_WITH_FPGA #ifdef LITE_WITH_FPGA
case TARGET(kFPGA): case TARGET(kFPGA):
TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir); TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
...@@ -126,7 +139,7 @@ class Buffer { ...@@ -126,7 +139,7 @@ class Buffer {
const size_t img_h, const size_t img_h,
void* host_ptr = nullptr) { void* host_ptr = nullptr) {
if (target != target_ || cl_image2d_width_ < img_w || if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h) { cl_image2d_height_ < img_h || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
Free(); Free();
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr); data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
......
...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes ...@@ -21,6 +21,8 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/assign_value_eliminate_pass.cc elimination/assign_value_eliminate_pass.cc
...@@ -36,6 +38,7 @@ lite_cc_library(mir_passes ...@@ -36,6 +38,7 @@ lite_cc_library(mir_passes
demo_pass.cc demo_pass.cc
runtime_context_assign_pass.cc runtime_context_assign_pass.cc
memory_optimize_pass.cc memory_optimize_pass.cc
mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc quantized_op_attributes_inference_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
...@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op) ...@@ -70,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op)
if (WITH_TESTING) if (WITH_TESTING)
list(APPEND pattern_deps gtest) list(APPEND pattern_deps gtest)
endif() endif()
lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps})
lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher)
lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher)
# for mobile, unnecessary to compile the following testings. # for mobile, unnecessary to compile the following testings.
......
...@@ -27,8 +27,8 @@ ...@@ -27,8 +27,8 @@
#include "lite/utils/string.h" #include "lite/utils/string.h"
namespace paddle { namespace paddle {
namespace inference { namespace lite {
namespace analysis { namespace mir {
static size_t dot_node_counter{0}; static size_t dot_node_counter{0};
...@@ -162,6 +162,6 @@ class Dot { ...@@ -162,6 +162,6 @@ class Dot {
std::vector<Attr> attrs_; std::vector<Attr> attrs_;
}; };
} // namespace analysis } // namespace mir
} // namespace inference } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUSingleEncoderFuser : public FuseBase {
public:
explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu")
: act_type_(act_type) {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_mul_y =
VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* q_mul = OpNode("q_mul", "mul");
auto* q_mul_out = VarNode("q_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* q_add_y = VarNode("q_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate();
auto* q_add_out = VarNode("q_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate();
auto* q_reshape2_out = VarNode("q_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* q_reshape2_xshape = VarNode("q_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate();
auto* q_transpose2_out = VarNode("q_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("scale", "X")
->AsIntermediate();
auto* q_transpose2_xshape =
VarNode("q_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate();
auto* q_scale_out = VarNode("q_scale_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* k_mul_y =
VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate();
auto* k_mul_out = VarNode("k_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* k_add_y = VarNode("k_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate();
auto* k_add_out = VarNode("k_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate();
auto* k_reshape2_out = VarNode("k_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* k_reshape2_xshape = VarNode("k_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate();
auto* k_transpose2_out = VarNode("k_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* k_transpose2_xshape =
VarNode("k_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate();
auto* qk_matmul_out = VarNode("qk_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qk_mask = VarNode("qk_mask")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate();
auto* qk_add_out = VarNode("qk_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("softmax", "X")
->AsIntermediate();
auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate();
auto* qk_softmax_out = VarNode("qk_softmax_out")
->assert_is_op_output("softmax", "Out")
->AsIntermediate();
auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate();
auto* qk_dropout_out = VarNode("qk_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("matmul", "X")
->AsIntermediate();
auto* qk_dropout_mask = VarNode("qk_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* v_mul_y =
VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate();
auto* v_mul_out = VarNode("v_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* v_add_y = VarNode("v_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate();
auto* v_add_out = VarNode("v_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate();
auto* v_reshape2_out = VarNode("v_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* v_reshape2_xshape = VarNode("v_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate();
auto* v_transpose2_out = VarNode("v_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul", "Y")
->AsIntermediate();
auto* v_transpose2_xshape =
VarNode("v_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate();
auto* qkv_matmul_out = VarNode("qkv_matmul_out")
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
auto* qkv_transpose2 =
OpNode("qkv_transpose2", "transpose2")->AsIntermediate();
auto* qkv_transpose2_out = VarNode("qkv_transpose2_out")
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("reshape2", "X")
->AsIntermediate();
auto* qkv_transpose2_xshape =
VarNode("qkv_transpose2_xshape")
->assert_is_op_output("transpose2", "XShape")
->AsIntermediate();
auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate();
auto* qkv_reshape2_out = VarNode("qkv_reshape2_out")
->assert_is_op_output("reshape2", "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape")
->assert_is_op_output("reshape2", "XShape")
->AsIntermediate();
auto* qkv_mul_y =
VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate();
auto* qkv_mul_out = VarNode("qkv_mul_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_y = VarNode("qkv_add_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate();
auto* qkv_add_out = VarNode("qkv_add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate();
auto* qkv_dropout_out = VarNode("qkv_dropout_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_mask = VarNode("qkv_dropout_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate();
auto* qkv_add_2_out = VarNode("qkv_add_2_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate();
auto* qkv_ln_2_out = VarNode("qkv_ln_2_out")
->assert_is_op_output("layer_norm", "Y")
->assert_is_op_input("mul", "X")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_2_var = VarNode("qkv_ln_2_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
auto* qkv_mul_3_y =
VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate();
auto* qkv_mul_3_out = VarNode("qkv_mul_3_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_3_y = VarNode("qkv_add_3_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate();
auto* qkv_add_3_out = VarNode("qkv_add_3_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate();
auto* qkv_act_out = VarNode("qkv_act_out")
->assert_is_op_output(act_type_, "Out")
->assert_is_op_input("mul", "X")
->AsIntermediate();
auto* qkv_mul_4_y =
VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput();
auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate();
auto* qkv_mul_4_out = VarNode("qkv_mul_4_out")
->assert_is_op_output("mul", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_add_4_y = VarNode("qkv_add_4_y")
->assert_is_op_input("elementwise_add", "Y")
->AsInput();
auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate();
auto* qkv_add_4_out = VarNode("qkv_add_4_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("dropout", "X")
->AsIntermediate();
auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate();
auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out")
->assert_is_op_output("dropout", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask")
->assert_is_op_output("dropout", "Mask")
->AsIntermediate();
auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate();
auto* qkv_add_5_out = VarNode("qkv_add_5_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("layer_norm", "X")
->AsIntermediate();
auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale")
->assert_is_op_input("layer_norm", "Scale")
->AsInput();
auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias")
->assert_is_op_input("layer_norm", "Bias")
->AsInput();
auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate();
auto* qkv_ln_5_out = VarNode("qkv_ln_5_out")
->assert_is_op_output("layer_norm", "Y")
->AsOutput();
auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean")
->assert_is_op_output("layer_norm", "Mean")
->AsIntermediate();
auto* qkv_ln_5_var = VarNode("qkv_ln_5_var")
->assert_is_op_output("layer_norm", "Variance")
->AsIntermediate();
// TODO(miaotianxiang): use LinksFrom/LinksTo() instead
*input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >>
*q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >>
*q_scale_out >> *qk_matmul;
*q_mul_y >> *q_mul;
*q_add_y >> *q_add;
*q_reshape2 >> *q_reshape2_xshape;
*q_transpose2 >> *q_transpose2_xshape;
*input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >>
*k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul;
*k_mul_y >> *k_mul;
*k_add_y >> *k_add;
*k_reshape2 >> *k_reshape2_xshape;
*k_transpose2 >> *k_transpose2_xshape;
*qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >>
*qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul;
*qk_mask >> *qk_add;
*qk_dropout >> *qk_dropout_mask;
*input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >>
*v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul;
*v_mul_y >> *v_mul;
*v_add_y >> *v_add;
*v_reshape2 >> *v_reshape2_xshape;
*v_transpose2 >> *v_transpose2_xshape;
*qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >>
*qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >>
*qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >>
*qkv_add_2;
*qkv_transpose2 >> *qkv_transpose2_xshape;
*qkv_reshape2 >> *qkv_reshape2_xshape;
*qkv_mul_y >> *qkv_mul;
*qkv_add_y >> *qkv_add;
*qkv_dropout >> *qkv_dropout_mask;
*input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out;
*qkv_ln_2_scale >> *qkv_ln_2;
*qkv_ln_2_bias >> *qkv_ln_2;
*qkv_ln_2 >> *qkv_ln_2_mean;
*qkv_ln_2 >> *qkv_ln_2_var;
*qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >>
*qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >>
*qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >>
*qkv_dropout_4_out >> *qkv_add_5;
*qkv_mul_3_y >> *qkv_mul_3;
*qkv_add_3_y >> *qkv_add_3;
*qkv_mul_4_y >> *qkv_mul_4;
*qkv_add_4_y >> *qkv_add_4;
*qkv_dropout_4 >> *qkv_dropout_4_mask;
*qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out;
*qkv_ln_5_scale >> *qkv_ln_5;
*qkv_ln_5_bias >> *qkv_ln_5;
*qkv_ln_5 >> *qkv_ln_5_mean;
*qkv_ln_5 >> *qkv_ln_5_var;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("single_encoder");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name});
op_desc.SetInput("FCWeight",
{
matched.at("q_mul_y")->arg()->name,
matched.at("k_mul_y")->arg()->name,
matched.at("v_mul_y")->arg()->name,
matched.at("qkv_mul_y")->arg()->name,
matched.at("qkv_mul_3_y")->arg()->name,
matched.at("qkv_mul_4_y")->arg()->name,
});
op_desc.SetInput("FCBias",
{
matched.at("q_add_y")->arg()->name,
matched.at("k_add_y")->arg()->name,
matched.at("v_add_y")->arg()->name,
matched.at("qkv_add_y")->arg()->name,
matched.at("qkv_add_3_y")->arg()->name,
matched.at("qkv_add_4_y")->arg()->name,
});
op_desc.SetInput("LNScale",
{
matched.at("qkv_ln_2_scale")->arg()->name,
matched.at("qkv_ln_5_scale")->arg()->name,
});
op_desc.SetInput("LNBias",
{
matched.at("qkv_ln_2_bias")->arg()->name,
matched.at("qkv_ln_5_bias")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
// extra traits to distill
auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info();
auto reshape_dim = reshape_op_info->GetAttr<std::vector<int>>("shape");
op_desc.SetAttr<int>("head_num", reshape_dim[2]);
op_desc.SetAttr<int>("size_per_head", reshape_dim[3]);
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
single_encoder_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"qk_mask",
"k_mul_y",
"v_mul_y",
"qkv_mul_y",
"qkv_mul_3_y",
"qkv_mul_4_y",
"q_add_y",
"k_add_y",
"v_add_y",
"qkv_add_y",
"qkv_add_3_y",
"qkv_add_4_y",
"qkv_ln_2_scale",
"qkv_ln_2_bias",
"qkv_ln_5_scale",
"qkv_ln_5_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul"));
}
IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out"));
}
private:
std::string act_type_;
};
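How the fuser is driven (this mirrors the pass body further below): construct it for one activation type and apply it to the graph, which runs BuildPattern() and then InsertNewNode() for every match:

// Usage sketch, taken from XPUMultiEncoderFusePass::Apply() below.
fusion::XPUSingleEncoderFuser gelu_fuser("gelu");
gelu_fuser(graph.get());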
class XPUMultiEncoderFuser {
public:
bool IsDirectPredecessorOf(Node* op1, Node* op2) {
for (auto* out : op1->outlinks) {
for (auto* in : op2->inlinks) {
if (out == in) return true;
}
}
return false;
}
void operator()(SSAGraph* graph) {
std::vector<Node*> all_encoders;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "single_encoder") {
all_encoders.push_back(node);
}
}
VLOG(3) << "Found " << all_encoders.size() << " single_encoder";
if (all_encoders.size() == 0) {
return;
}
// TODO(miaotianxiang): more verification
for (size_t i = 0; i < all_encoders.size() - 1; ++i) {
CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1]));
}
std::string mask_name;
for (auto* encoder : all_encoders) {
auto* op_info = encoder->stmt()->op_info();
if (mask_name.empty()) {
mask_name = op_info->Input("Mask").front();
} else {
// CHECK(mask_name == op_info->Input("Mask").front());
}
}
std::unordered_set<const Node*> to_remove;
Node* first_encoder = all_encoders[0];
std::string in_name, out_name;
std::vector<std::string> arg_names{
"FCWeight", "FCBias", "LNScale", "LNBias"};
std::unordered_map<std::string, std::vector<std::string>> arg_map;
for (size_t i = 0; i < all_encoders.size(); ++i) {
Node* cur_encoder = all_encoders[i];
auto* op_info = cur_encoder->stmt()->op_info();
for (auto arg_name : arg_names) {
auto real_names = op_info->Input(arg_name);
for (auto name : real_names) {
auto* arg_node = graph->RetrieveArgument(name);
DirectedLink(arg_node, first_encoder);
arg_map[arg_name].push_back(name);
}
}
auto* cur_out =
graph->RetrieveArgument(op_info->Output("Outputs").front());
if (i == 0) {
// first encoder
to_remove.insert(cur_out);
in_name = op_info->Input("Inputs").front();
mask_name = op_info->Input("Mask").front();
} else if (i == all_encoders.size() - 1) {
// last encoder
to_remove.insert(cur_encoder);
DirectedLink(first_encoder, cur_out);
out_name = op_info->Output("Outputs").front();
} else {
to_remove.insert(cur_encoder);
to_remove.insert(cur_out);
}
}
GraphSafeRemoveNodes(graph, to_remove);
auto* multi_encoder_stmt = first_encoder->stmt();
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__multi_encoder");
op_desc.SetInput("Input", {in_name});
for (auto kv : arg_map) {
op_desc.SetInput(kv.first, kv.second);
}
op_desc.SetInput("Mask", {mask_name});
op_desc.SetOutput("Output", {out_name});
op_desc.SetAttr<int>("xpu", 1);
auto* first_encoder_op_info = multi_encoder_stmt->op_info();
op_desc.SetAttr<int>("head_num",
first_encoder_op_info->GetAttr<int>("head_num"));
op_desc.SetAttr<int>("size_per_head",
first_encoder_op_info->GetAttr<int>("size_per_head"));
op_desc.SetAttr<int>("n_layers", all_encoders.size());
op_desc.SetAttr<std::string>(
"act_type", first_encoder_op_info->GetAttr<std::string>("act_type"));
auto* scope = multi_encoder_stmt->op()->scope();
std::vector<float> fc_weight_max(arg_map["FCWeight"].size());
auto& fc_weight_names = arg_map["FCWeight"];
for (size_t i = 0; i < fc_weight_names.size(); ++i) {
auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]);
auto weight_dims = weight_t->dims();
int weight_len = weight_t->numel();
float* weight_on_host = weight_t->mutable_data<float>();
float max_f =
paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
weight_on_host, weight_int16.get(), max_f, weight_len);
paddle::lite::xpu::math::Transpose(weight_int16.get(),
weight_trans_int16.get(),
weight_dims[0],
weight_dims[1]);
memcpy(weight_on_host,
weight_trans_int16.get(),
weight_len * sizeof(int16_t));
fc_weight_max[i] = max_f;
}
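The loop above relies on the xpu::math helpers. As a hedged, self-contained sketch of what they presumably compute (symmetric fp32-to-int16 quantization against the per-tensor max, followed by a 2-D transpose); the names ending in "Sketch" are hypothetical:

// Hypothetical standalone sketch of FindMaxAbs / ConvertFP32ToInt16 / Transpose.
#include <algorithm>
#include <cmath>
#include <cstdint>

static float FindMaxAbsSketch(const float* data, int len) {
  float m = 0.f;
  for (int i = 0; i < len; ++i) m = std::max(m, std::fabs(data[i]));
  return m;
}

static void QuantAndTransposeSketch(const float* src, int16_t* dst,
                                    float max_abs, int rows, int cols) {
  const float scale = max_abs > 0.f ? 32767.f / max_abs : 0.f;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      dst[c * rows + r] =
          static_cast<int16_t>(std::round(src[r * cols + c] * scale));
}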
std::string max_name = "encoder_max";
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, first_encoder);
auto* max_filter_tensor = scope->NewTensor(max_name);
max_filter_tensor->Resize({static_cast<int>(fc_weight_max.size())});
memcpy(max_filter_tensor->mutable_data<float>(),
&fc_weight_max[0],
sizeof(float) * fc_weight_max.size());
op_desc.SetInput("FCWeightMax", {max_name});
auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type());
multi_encoder_op->Attach(op_desc, scope);
multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places());
auto kernels =
multi_encoder_op->CreateKernels(multi_encoder_op->valid_places());
multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels));
// temporarily remove useless cast ops: casts fed by the stack output whose results have no consumers
std::unordered_set<const Node*> to_remove2;
Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
if (node->stmt()->op_info()->Type() == "stack") {
stack = node;
}
}
if (stack == nullptr) return;  // guard: the graph may contain no stack op
Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) {
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// remove
to_remove2.insert(cast_out);
to_remove2.insert(cast);
}
}
GraphSafeRemoveNodes(graph, to_remove2);
}
};
} // namespace fusion
class XPUMultiEncoderFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
// TODO(miaotianxiang): backup graph, recover from failed match
std::vector<std::string> act_types{"gelu", "relu"};
for (auto& act_type : act_types) {
fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type);
single_encoder_fuser(graph.get());
fusion::XPUMultiEncoderFuser multi_encoder_fuser;
multi_encoder_fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass,
paddle::lite::mir::XPUMultiEncoderFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("matmul");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUResNetBlock0Fuser : public FuseBase {
public:
XPUResNetBlock0Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* left_conv1_weight = VarNode("left_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv1 = OpNode("left_conv1", "conv2d");
auto* left_conv1_out = VarNode("left_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn1_scale = VarNode("left_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn1_bias = VarNode("left_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn1_mean = VarNode("left_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn1_var = VarNode("left_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate();
auto* left_bn1_out = VarNode("left_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn1_mean_out = VarNode("left_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn1_var_out =
VarNode("left_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn1_saved_mean =
VarNode("left_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn1_saved_var =
VarNode("left_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate();
auto* left_relu1_out = VarNode("left_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv2_weight = VarNode("left_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate();
auto* left_conv2_out = VarNode("left_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn2_scale = VarNode("left_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn2_bias = VarNode("left_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn2_mean = VarNode("left_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn2_var = VarNode("left_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate();
auto* left_bn2_out = VarNode("left_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* left_bn2_mean_out = VarNode("left_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn2_var_out =
VarNode("left_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn2_saved_mean =
VarNode("left_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn2_saved_var =
VarNode("left_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate();
auto* left_relu2_out = VarNode("left_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* left_conv3_weight = VarNode("left_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate();
auto* left_conv3_out = VarNode("left_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* left_bn3_scale = VarNode("left_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* left_bn3_bias = VarNode("left_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* left_bn3_mean = VarNode("left_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* left_bn3_var = VarNode("left_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate();
auto* left_bn3_out = VarNode("left_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* left_bn3_mean_out = VarNode("left_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* left_bn3_var_out =
VarNode("left_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* left_bn3_saved_mean =
VarNode("left_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* left_bn3_saved_var =
VarNode("left_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate();
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >>
*left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >>
*left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >>
*left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add;
*left_conv1_weight >> *left_conv1;
*left_bn1_scale >> *left_bn1;
*left_bn1_bias >> *left_bn1;
*left_bn1_mean >> *left_bn1;
*left_bn1_var >> *left_bn1;
*left_bn1 >> *left_bn1_mean_out;
*left_bn1 >> *left_bn1_var_out;
*left_bn1 >> *left_bn1_saved_mean;
*left_bn1 >> *left_bn1_saved_var;
*left_conv2_weight >> *left_conv2;
*left_bn2_scale >> *left_bn2;
*left_bn2_bias >> *left_bn2;
*left_bn2_mean >> *left_bn2;
*left_bn2_var >> *left_bn2;
*left_bn2 >> *left_bn2_mean_out;
*left_bn2 >> *left_bn2_var_out;
*left_bn2 >> *left_bn2_saved_mean;
*left_bn2 >> *left_bn2_saved_var;
*left_conv3_weight >> *left_conv3;
*left_bn3_scale >> *left_bn3;
*left_bn3_bias >> *left_bn3;
*left_bn3_mean >> *left_bn3;
*left_bn3_var >> *left_bn3;
*left_bn3 >> *left_bn3_mean_out;
*left_bn3 >> *left_bn3_var_out;
*left_bn3 >> *left_bn3_saved_mean;
*left_bn3 >> *left_bn3_saved_var;
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block0");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("left_conv1_weight")->arg()->name,
matched.at("left_conv2_weight")->arg()->name,
matched.at("left_conv3_weight")->arg()->name,
matched.at("right_conv1_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("left_bn1_scale")->arg()->name,
matched.at("left_bn2_scale")->arg()->name,
matched.at("left_bn3_scale")->arg()->name,
matched.at("right_bn1_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("left_bn1_bias")->arg()->name,
matched.at("left_bn2_bias")->arg()->name,
matched.at("left_bn3_bias")->arg()->name,
matched.at("right_bn1_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("left_bn1_mean")->arg()->name,
matched.at("left_bn2_mean")->arg()->name,
matched.at("left_bn3_mean")->arg()->name,
matched.at("right_bn1_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("left_bn1_variance")->arg()->name,
matched.at("left_bn2_variance")->arg()->name,
matched.at("left_bn3_variance")->arg()->name,
matched.at("right_bn1_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"left_conv2_weight",
"left_conv3_weight",
"right_conv1_weight",
"left_bn1_bias",
"left_bn2_bias",
"left_bn3_bias",
"right_bn1_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1"));
}
IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out"));
}
};
class XPUResNetBlock1Fuser : public FuseBase {
public:
XPUResNetBlock1Fuser() {}
void BuildPattern() override {
auto* input = VarNode("input")
->assert_is_op_input("conv2d", "Input")
->assert_is_op_input("elementwise_add", "X")
->AsInput();
auto* right_conv1_weight = VarNode("right_conv1_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv1 = OpNode("right_conv1", "conv2d");
auto* right_conv1_out = VarNode("right_conv1_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn1_scale = VarNode("right_bn1_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn1_bias = VarNode("right_bn1_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn1_mean = VarNode("right_bn1_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn1_var = VarNode("right_bn1_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
auto* right_bn1_out = VarNode("right_bn1_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn1_mean_out =
VarNode("right_bn1_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn1_var_out =
VarNode("right_bn1_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn1_saved_mean =
VarNode("right_bn1_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn1_saved_var =
VarNode("right_bn1_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate();
auto* right_relu1_out = VarNode("right_relu1_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv2_weight = VarNode("right_conv2_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate();
auto* right_conv2_out = VarNode("right_conv2_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn2_scale = VarNode("right_bn2_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn2_bias = VarNode("right_bn2_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn2_mean = VarNode("right_bn2_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn2_var = VarNode("right_bn2_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate();
auto* right_bn2_out = VarNode("right_bn2_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* right_bn2_mean_out =
VarNode("right_bn2_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn2_var_out =
VarNode("right_bn2_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn2_saved_mean =
VarNode("right_bn2_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn2_saved_var =
VarNode("right_bn2_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate();
auto* right_relu2_out = VarNode("right_relu2_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("conv2d", "Input")
->AsIntermediate();
auto* right_conv3_weight = VarNode("right_conv3_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate();
auto* right_conv3_out = VarNode("right_conv3_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* right_bn3_scale = VarNode("right_bn3_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* right_bn3_bias = VarNode("right_bn3_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* right_bn3_mean = VarNode("right_bn3_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* right_bn3_var = VarNode("right_bn3_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate();
auto* right_bn3_out = VarNode("right_bn3_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* right_bn3_mean_out =
VarNode("right_bn3_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* right_bn3_var_out =
VarNode("right_bn3_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* right_bn3_saved_mean =
VarNode("right_bn3_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* right_bn3_saved_var =
VarNode("right_bn3_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
auto* add_out = VarNode("add_out")
->assert_is_op_output("elementwise_add", "Out")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* relu = OpNode("relu", "relu")->AsIntermediate();
auto* relu_out =
VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
*input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >>
*right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >>
*right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >>
*right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >>
*right_bn3_out >> *add;
*right_conv1_weight >> *right_conv1;
*right_bn1_scale >> *right_bn1;
*right_bn1_bias >> *right_bn1;
*right_bn1_mean >> *right_bn1;
*right_bn1_var >> *right_bn1;
*right_bn1 >> *right_bn1_mean_out;
*right_bn1 >> *right_bn1_var_out;
*right_bn1 >> *right_bn1_saved_mean;
*right_bn1 >> *right_bn1_saved_var;
*right_conv2_weight >> *right_conv2;
*right_bn2_scale >> *right_bn2;
*right_bn2_bias >> *right_bn2;
*right_bn2_mean >> *right_bn2;
*right_bn2_var >> *right_bn2;
*right_bn2 >> *right_bn2_mean_out;
*right_bn2 >> *right_bn2_var_out;
*right_bn2 >> *right_bn2_saved_mean;
*right_bn2 >> *right_bn2_saved_var;
*right_conv3_weight >> *right_conv3;
*right_bn3_scale >> *right_bn3;
*right_bn3_bias >> *right_bn3;
*right_bn3_mean >> *right_bn3;
*right_bn3_var >> *right_bn3;
*right_bn3 >> *right_bn3_mean_out;
*right_bn3 >> *right_bn3_var_out;
*right_bn3 >> *right_bn3_saved_mean;
*right_bn3 >> *right_bn3_saved_var;
*input >> *add;
*add >> *add_out >> *relu >> *relu_out;
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("resnet_block1");
op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter",
{
matched.at("right_conv1_weight")->arg()->name,
matched.at("right_conv2_weight")->arg()->name,
matched.at("right_conv3_weight")->arg()->name,
});
op_desc.SetInput("Scale",
{
matched.at("right_bn1_scale")->arg()->name,
matched.at("right_bn2_scale")->arg()->name,
matched.at("right_bn3_scale")->arg()->name,
});
op_desc.SetInput("Bias",
{
matched.at("right_bn1_bias")->arg()->name,
matched.at("right_bn2_bias")->arg()->name,
matched.at("right_bn3_bias")->arg()->name,
});
op_desc.SetInput("Mean",
{
matched.at("right_bn1_mean")->arg()->name,
matched.at("right_bn2_mean")->arg()->name,
matched.at("right_bn3_mean")->arg()->name,
});
op_desc.SetInput("Var",
{
matched.at("right_bn1_variance")->arg()->name,
matched.at("right_bn2_variance")->arg()->name,
matched.at("right_bn3_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
// XXX: keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr<int>("sub_block", 0);
op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
std::vector<std::string> froms = {
"right_conv2_weight",
"right_conv3_weight",
"right_bn1_bias",
"right_bn2_bias",
"right_bn3_bias",
};
for (auto& from : froms) {
IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1"));
}
IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out"));
}
};
class XPUResNet50Fuser : public xpu::XPUFuseBase {
public:
XPUResNet50Fuser() {}
void BuildPattern() override {
auto* input =
VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
auto* top_conv_weight = VarNode("top_conv_weight")
->assert_is_op_input("conv2d", "Filter")
->AsInput();
auto* top_conv = OpNode("top_conv", "conv2d");
auto* top_conv_out = VarNode("top_conv_out")
->assert_is_op_output("conv2d", "Output")
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* top_bn_scale = VarNode("top_bn_scale")
->assert_is_op_input("batch_norm", "Scale")
->AsIntermediate();
auto* top_bn_bias = VarNode("top_bn_bias")
->assert_is_op_input("batch_norm", "Bias")
->AsInput();
auto* top_bn_mean = VarNode("top_bn_mean")
->assert_is_op_input("batch_norm", "Mean")
->AsIntermediate();
auto* top_bn_var = VarNode("top_bn_variance")
->assert_is_op_input("batch_norm", "Variance")
->AsIntermediate();
auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate();
auto* top_bn_out = VarNode("top_bn_out")
->assert_is_op_output("batch_norm", "Y")
->assert_is_op_input("relu", "X")
->AsIntermediate();
auto* top_bn_mean_out = VarNode("top_bn_mean_out")
->assert_is_op_output("batch_norm", "MeanOut")
->AsIntermediate();
auto* top_bn_var_out =
VarNode("top_bn_var_out")
->assert_is_op_output("batch_norm", "VarianceOut")
->AsIntermediate();
auto* top_bn_saved_mean =
VarNode("top_bn_saved_mean")
->assert_is_op_output("batch_norm", "SavedMean")
->AsIntermediate();
auto* top_bn_saved_var =
VarNode("top_bn_saved_var")
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate();
auto* top_relu_out = VarNode("top_relu_out")
->assert_is_op_output("relu", "Out")
->assert_is_op_input("pool2d", "X")
->AsIntermediate();
auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate();
auto* top_pool_out = VarNode("top_pool_out")
->assert_is_op_output("pool2d", "Out")
->assert_is_op_input("resnet_block0", "Inputs")
->AsIntermediate();
// args are left out
auto* resnet_block0_1 =
OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate();
auto* resnet_block0_1_out =
VarNode("resnet_block0_1_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_1 =
OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_1_out =
VarNode("resnet_block1_1_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_1_2 =
OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_1_2_out =
VarNode("resnet_block1_1_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_2 =
OpNode("resnet_block0_2", "resnet_block0")->AsIntermediate();
auto* resnet_block0_2_out =
VarNode("resnet_block0_2_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_1 =
OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_1_out =
VarNode("resnet_block1_2_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_2 =
OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_2_out =
VarNode("resnet_block1_2_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_3 =
OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_2_3_out =
VarNode("resnet_block1_2_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_3 =
OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate();
auto* resnet_block0_3_out =
VarNode("resnet_block0_3_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_1 =
OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_1_out =
VarNode("resnet_block1_3_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_2 =
OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_2_out =
VarNode("resnet_block1_3_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_3 =
OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_3_out =
VarNode("resnet_block1_3_3_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_4 =
OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_4_out =
VarNode("resnet_block1_3_4_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_5 =
OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate();
auto* resnet_block1_3_5_out =
VarNode("resnet_block1_3_5_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block0_4 =
OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate();
auto* resnet_block0_4_out =
VarNode("resnet_block0_4_out")
->assert_is_op_output("resnet_block0", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_1 =
OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_1_out =
VarNode("resnet_block1_4_1_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_2 =
OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate();
auto* resnet_block1_4_2_out =
VarNode("resnet_block1_4_2_out")
->assert_is_op_output("resnet_block1", "Outputs")
->AsIntermediate();
auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate();
auto* bottom_pool_out = VarNode("bottom_pool_out")
->assert_is_op_output("pool2d", "Out")
->AsOutput();
*input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >>
*top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >>
*resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >>
*resnet_block1_1_1_out >> *resnet_block1_1_2 >>
*resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >>
*resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >>
*resnet_block1_2_2_out >> *resnet_block1_2_3 >>
*resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >>
*resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >>
*resnet_block1_3_2_out >> *resnet_block1_3_3 >>
*resnet_block1_3_3_out >> *resnet_block1_3_4 >>
*resnet_block1_3_4_out >> *resnet_block1_3_5 >>
*resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >>
*resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >>
*resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out;
*top_conv_weight >> *top_conv;
*top_bn_scale >> *top_bn;
*top_bn_bias >> *top_bn;
*top_bn_mean >> *top_bn;
*top_bn_var >> *top_bn;
*top_bn >> *top_bn_mean_out;
*top_bn >> *top_bn_var_out;
*top_bn >> *top_bn_saved_mean;
*top_bn >> *top_bn_saved_var;
}
void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__resnet50");
op_desc.SetInput("Input", {matched.at("input")->arg()->name});
std::vector<std::string> filter_name = {
matched.at("top_conv_weight")->arg()->name};
std::vector<std::string> scale_name = {
matched.at("top_bn_scale")->arg()->name};
std::vector<std::string> bias_name = {
matched.at("top_bn_bias")->arg()->name};
std::vector<std::string> mean_name = {
matched.at("top_bn_mean")->arg()->name};
std::vector<std::string> var_name = {
matched.at("top_bn_variance")->arg()->name};
std::vector<std::string> max_filter_name;
std::vector<std::string> resnet_block_vec = {
"resnet_block0_1",
"resnet_block1_1_1",
"resnet_block1_1_2",
"resnet_block0_2",
"resnet_block1_2_1",
"resnet_block1_2_2",
"resnet_block1_2_3",
"resnet_block0_3",
"resnet_block1_3_1",
"resnet_block1_3_2",
"resnet_block1_3_3",
"resnet_block1_3_4",
"resnet_block1_3_5",
"resnet_block0_4",
"resnet_block1_4_1",
"resnet_block1_4_2",
};
for (auto& block : resnet_block_vec) {
auto* block_op_info = matched.at(block)->stmt()->op_info();
auto block_filter_name = block_op_info->Input("Filter");
std::copy(block_filter_name.begin(),
block_filter_name.end(),
std::back_inserter(filter_name));
auto block_scale_name = block_op_info->Input("Scale");
std::copy(block_scale_name.begin(),
block_scale_name.end(),
std::back_inserter(scale_name));
auto block_bias_name = block_op_info->Input("Bias");
std::copy(block_bias_name.begin(),
block_bias_name.end(),
std::back_inserter(bias_name));
auto block_mean_name = block_op_info->Input("Mean");
std::copy(block_mean_name.begin(),
block_mean_name.end(),
std::back_inserter(mean_name));
auto block_var_name = block_op_info->Input("Var");
std::copy(block_var_name.begin(),
block_var_name.end(),
std::back_inserter(var_name));
}
op_desc.SetInput("Filter", filter_name);
op_desc.SetInput("Bias", bias_name);
op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name});
op_desc.SetAttr<int>("xpu", 1);
auto* resnet50_stmt = matched.at("top_conv")->stmt();
auto* scope = resnet50_stmt->op()->scope();
for (size_t i = 0; i < filter_name.size(); ++i) {
auto* filter_t = scope->FindMutableTensor(filter_name[i]);
auto* scale_t = scope->FindMutableTensor(scale_name[i]);
auto* bias_t = scope->FindMutableTensor(bias_name[i]);
auto* mean_t = scope->FindMutableTensor(mean_name[i]);
auto* var_t = scope->FindMutableTensor(var_name[i]);
int mean_len = mean_t->numel();
int filter_len = filter_t->numel();
int filter_stride = filter_len / mean_len;
float* filter_on_host = filter_t->mutable_data<float>();
float* scale_on_host = scale_t->mutable_data<float>();
float* bias_on_host = bias_t->mutable_data<float>();
float* mean_on_host = mean_t->mutable_data<float>();
float* var_on_host = var_t->mutable_data<float>();
// Perform preprocess
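// Fold the batch_norm statistics into the conv filter and bias. A sketch of
// the math (note the epsilon is hard-coded to 1e-5 below instead of being
// read from the batch_norm op):
//   alpha   = scale / sqrt(variance + epsilon)
//   filter' = filter * alpha   (broadcast over each output channel)
//   bias'   = bias - mean * alpha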
for (int i = 0; i < mean_len; ++i) {
scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
}
for (int i = 0; i < mean_len; ++i) {
for (int j = 0; j < filter_stride; ++j) {
filter_on_host[i * filter_stride + j] *= scale_on_host[i];
}
}
for (int i = 0; i < mean_len; ++i) {
bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
}
float max_f =
paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
filter_on_host, filter_int16.get(), max_f, filter_len);
memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
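// The filter is now quantized to int16 with a per-tensor symmetric scale
// (max_f) and stored back into the original buffer; the scale is exposed to
// the kernel through the "<filter_name>_max" tensor created below.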
// create new arg in graph and scope
std::string max_name = filter_name[i] + "_max";
max_filter_name.push_back(max_name);
auto* max_filter_node = graph->NewArgumentNode(max_name);
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
DirectedLink(max_filter_node, matched.at("top_conv"));
auto* max_filter_t = scope->NewTensor(max_name);
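// The max tensor is sized 4 and filled with the same per-tensor max value;
// the XPU conv kernel is assumed to read a small fixed-size max buffer.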
max_filter_t->Resize({4});
float* max_ptr = max_filter_t->mutable_data<float>();
max_ptr[0] = max_f;
max_ptr[1] = max_f;
max_ptr[2] = max_f;
max_ptr[3] = max_f;
}
op_desc.SetInput("MaxFilter", max_filter_name);
auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type());
resnet50_op->Attach(op_desc, scope);
resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places());
auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places());
resnet50_stmt->SetOp(resnet50_op);
resnet50_stmt->SetKernels(std::move(kernels));
IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
for (auto* node : extra_input_vars) {
IR_NODE_LINK_TO(node, matched.at("top_conv"));
}
IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out"));
}
};
} // namespace fusion
class XPUResNet50FusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUResNetBlock0Fuser block0_fuser;
block0_fuser(graph.get());
fusion::XPUResNetBlock1Fuser block1_fuser;
block1_fuser(graph.get());
fusion::XPUResNet50Fuser resnet50_fuser;
resnet50_fuser(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__resnet_fuse_pass,
paddle::lite::mir::XPUResNet50FusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("conv2d");
...@@ -26,15 +26,13 @@ namespace paddle { ...@@ -26,15 +26,13 @@ namespace paddle {
namespace lite { namespace lite {
namespace mir { namespace mir {
using inference::analysis::Dot;
void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(5) << "\n" << Visualize(graph.get()); VLOG(5) << "\n" << Visualize(graph.get());
} }
std::string Visualize(mir::SSAGraph* graph) { std::string Visualize(mir::SSAGraph* graph) {
std::ostringstream os; std::ostringstream os;
inference::analysis::Dot dot; Dot dot;
auto string_trunc = [](const std::string& str) -> std::string { auto string_trunc = [](const std::string& str) -> std::string {
const int max_disp_size = 100; const int max_disp_size = 100;
if (str.length() > max_disp_size) if (str.length() > max_disp_size)
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include "lite/core/mir/mlu_postprocess_pass.h" #include "lite/core/mir/mlu_postprocess_pass.h"
#include <list> #include <list>
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 4); // FP16 op_desc.SetAttr<int>("out_dtype", 4); // FP16
op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NCHW -> NHWC // NCHW -> NHWC
op_desc.SetAttr<std::vector<int>>("axis", {0, 2, 3, 1}); op_desc.SetInput("Input", {cur_node->AsArg().name});
op_desc.SetInput("X", {cur_node->AsArg().name});
op_desc.SetOutput("Out", {cast_arg_name}); op_desc.SetOutput("Out", {cast_arg_name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetInput("Input", {cur_node->AsArg().name});
...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) &&
DataLayoutCompatible(*out_arg_ty, *cast_type) &&
// for first conv
PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, ...@@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
if (op_type == "layout") {
stmt.picked_kernel().SetContext( stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
cast_arg->AsArg().type = cast_type; cast_arg->AsArg().type = cast_type;
auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name);
// for CastAfter manually set the tensor's type // for CastAfter manually set the tensor's type
var->GetMutable<::paddle::lite::Tensor>(); var->GetMutable<paddle::lite::Tensor>();
// create the stmt node // create the stmt node
auto* cast_inst = graph->NewInstructNode(); auto* cast_inst = graph->NewInstructNode();
...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
op_desc.SetAttr<int>("out_dtype", 5); // FP16 op_desc.SetAttr<int>("out_dtype", 5); // FP16
op_desc.SetInput("X", {cast_arg_name}); op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "transpose") { } else if (op_type == "layout") {
// NHWC -> NCHW // NHWC -> NCHW
op_desc.SetAttr<std::vector<int>>("axis", {0, 3, 1, 2}); op_desc.SetInput("Input", {cast_arg_name});
op_desc.SetInput("X", {cast_arg_name});
op_desc.SetOutput("Out", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cur_node->AsArg().name});
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetInput("Input", {cast_arg_name});
...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) {
is_found = true; is_found = true;
} }
} else if (op_type == "transpose") { } else if (op_type == "layout") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (DataLayoutCompatible(*in_arg_ty, *cast_type) &&
DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) {
is_found = true; is_found = true;
}
} else if (op_type == "io_copy") { } else if (op_type == "io_copy") {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, ...@@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type,
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op);
auto& stmt = cast_inst->AsStmt(); auto& stmt = cast_inst->AsStmt();
if (op_type == "layout") {
stmt.picked_kernel().SetContext( stmt.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); ContextScheduler::Global().NewContext(TARGET(kX86)));
} else {
stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
stmt.picked_kernel().target()));
}
break; break;
} }
} }
...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, ...@@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
auto* cur_node = head_node; auto* cur_node = head_node;
const auto name_prefix = const auto name_prefix =
head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
bool is_first_conv_head =
std::find(first_conv_nodes_.begin(),
first_conv_nodes_.end(),
head_node->AsArg().name) != first_conv_nodes_.end();
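// head vars recorded in first_conv_nodes_ keep their original (int8) type
// (see GatherAndModifyFirstConvNodes), so no precision cast is inserted for
// the first conv's input below.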
// layout cast node // precision cast node
if (head_type->layout() != inst_type->layout()) { if (head_type->precision() != inst_type->precision() && !is_first_conv_head) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
head_type->target(), head_type->precision(), inst_type->layout())); head_type->target(), inst_type->precision(), head_type->layout()));
} }
// precision cast node // layout cast node
if (head_type->precision() != inst_type->precision()) { if (head_type->layout() != inst_type->layout()) {
cur_node = InsertCastBefore( cur_node = InsertCastBefore(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph's valid precision // get subgraph's valid precision
const auto& places = graph->valid_places(); const auto& places = graph->valid_places();
std::set<::paddle::lite_api::PrecisionType> prec_set; std::set<paddle::lite_api::PrecisionType> prec_set;
for (const auto& place : places) { for (const auto& place : places) {
if (place.target == TARGET(kMLU)) { if (place.target == TARGET(kMLU)) {
prec_set.insert(place.precision); prec_set.insert(place.precision);
...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
const auto name_prefix = const auto name_prefix =
tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_";
// layout cast node // precision cast node
if (tail_type->layout() != inst_type->layout()) { if (tail_type->precision() != inst_type->precision()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"transpose", "cast",
name_prefix + "transpose", name_prefix + "cast",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
LiteType::GetTensorTy( LiteType::GetTensorTy(
tail_type->target(), tail_type->precision(), inst_type->layout())); tail_type->target(), inst_type->precision(), tail_type->layout()));
} }
// precision cast node // layout cast node
if (tail_type->precision() != inst_type->precision()) { if (tail_type->layout() != inst_type->layout()) {
cur_node = InsertCastAfter( cur_node = InsertCastAfter(
"cast", "layout",
name_prefix + "cast", name_prefix + "layout",
graph, graph,
cur_node, cur_node,
inst_node, inst_node,
...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, ...@@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i); auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateOutputTo( UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* The graph may look like this:
 *          subgraph_op_0
 *           /         \
 *          /           \
 *   subgraph_op_1    host_op
 * The tail var may also be referenced as an input by ops inside the
 * sub-block, so rename those input references as well.
 */
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
} }
// recreate the op // recreate the op
...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { ...@@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
} }
} }
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
return true;
}
}
}
}
return false;
}
bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) {
CHECK(arg_node->IsArg());
for (auto& inst : arg_node->outlinks) {
if (inst->AsStmt().op_type() == "subgraph") {
return IsFirstConvInSubgraph(arg_node, inst);
}
}
return false;
}
void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue;
if (node.AsStmt().op_type() == "feed") {
for (auto& out : node.outlinks) {
if (IsFirstConvNode(out)) {
first_conv_nodes_.insert(out->AsArg().name);
// modify first conv nodes' type
const auto* old_type = out->AsArg().type;
out->AsArg().type =
LiteType::GetTensorTy(old_type->target(),
paddle::lite_api::PrecisionType::kInt8,
old_type->layout(),
old_type->device());
}
}
}
}
}
void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
if (!node.IsStmt()) continue; if (!node.IsStmt()) continue;
...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
out->AsArg().type = out->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
inp->AsArg().type = inp->AsArg().type =
LiteType::GetTensorTy(old_type->target(), LiteType::GetTensorTy(old_type->target(),
old_type->precision(), old_type->precision(),
::paddle::lite_api::DataLayoutType::kNHWC, paddle::lite_api::DataLayoutType::kNHWC,
old_type->device()); old_type->device());
} }
} }
...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { ...@@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) {
} }
void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void MLUPostprocessPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// currently for non-persistent input and output args, mlu subgraph op // currently for non-persistent input and output args, mlu subgraph op
// only supports float16/float32 data types // only supports float16/float32 data types
// in two situations as follows: // in two situations as follows:
// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch;
// arg_in and arg_out are assumed to be NHWC which the user should be aware of. // arg_in and arg_out are assumed to be NHWC which the user should be aware of.
// Thus here we change these args' layout to NHWC // Thus here we change these args' layout to NHWC
#ifdef LITE_WITH_MLU
if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) {
ModifyLayout(graph.get()); ModifyLayout(graph.get());
}
if (lite::DeviceInfo::Global().UseFirstConv()) {
GatherAndModifyFirstConvNodes(graph.get());
}
#endif
// insert io_copy, layout and precision cast of subgraph's inputs and outputs // insert io_copy, layout and precision cast of subgraph's inputs and outputs
for (auto& node : graph->mutable_nodes()) { for (auto& node : graph->mutable_nodes()) {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <memory> #include <memory>
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/mir/pass.h" #include "lite/core/mir/pass.h"
...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { ...@@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass {
const Type* cast_type); const Type* cast_type);
void RecreateOp(Node* inst_node, SSAGraph* graph); void RecreateOp(Node* inst_node, SSAGraph* graph);
void GatherAndModifyFirstConvNodes(SSAGraph* graph);
bool IsFirstConvNode(Node* arg_node);
bool IsFirstConvInSubgraph(Node* arg_node, Node* inst);
private:
std::set<std::string> first_conv_nodes_;
}; };
} // namespace mir } // namespace mir
......
...@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) { ...@@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector<subgraph_t> *subgraphs) {
} }
std::string PMPattern::DotString() const { std::string PMPattern::DotString() const {
using inference::analysis::Dot;
Dot dot; Dot dot;
int id = 0; int id = 0;
// Create Nodes // Create Nodes
......
...@@ -64,7 +64,6 @@ class FuseBase { ...@@ -64,7 +64,6 @@ class FuseBase {
protected: protected:
virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
private:
void PerformPatternMatcher(SSAGraph* graph); void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate // Delete nodes that are marked as Intermediate
......
...@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() { ...@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
return adj_list; return adj_list;
} }
std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
std::map<mir::Node *, std::set<mir::Node *>> adj_list;
for (auto &n : mutable_nodes()) {
if (adj_list.find(&n) == adj_list.end()) {
adj_list[&n] = std::set<mir::Node *>();
}
std::vector<mir::Node *> nodes;
for (auto &var : n.inlinks) {
nodes.push_back(var);
}
std::sort(nodes.begin(),
nodes.end(),
[](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
void SSAGraph::SortHelper( void SSAGraph::SortHelper(
const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node, mir::Node *node,
...@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() { ...@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
return res; return res;
} }
std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
CheckBidirectionalConnection();
std::stack<mir::Node *> stack;
std::set<mir::Node *> visited;
std::vector<mir::Node *> res;
auto adj_list = BuildNodeAdjList();
for (auto adj : adj_list) {
if (visited.find(adj.first) == visited.end()) {
SortHelper(adj_list, adj.first, &visited, &res);
}
}
return res;
}
Node *SSAGraph::GraphCreateInstructNode( Node *SSAGraph::GraphCreateInstructNode(
const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) { const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
node_storage_.emplace_back(); node_storage_.emplace_back();
...@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() { ...@@ -213,9 +251,10 @@ std::vector<mir::Node *> SSAGraph::outputs() {
} }
mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) {
auto it = arguments_.find(arg); for (auto &node : node_storage_) {
if (it != arguments_.end()) { if (node.IsArg() && node.arg()->name == arg) {
return it->second; return &node;
}
} }
return nullptr; return nullptr;
} }
......
...@@ -42,6 +42,8 @@ class SSAGraph : GraphBase { ...@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {
std::vector<mir::Node *> StmtTopologicalOrder(); std::vector<mir::Node *> StmtTopologicalOrder();
std::vector<mir::Node *> NodeTopologicalOrder();
// The inputs of the graph. // The inputs of the graph.
std::vector<mir::Node *> inputs(); std::vector<mir::Node *> inputs();
...@@ -86,6 +88,9 @@ class SSAGraph : GraphBase { ...@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
// Build operator inlink edge table. // Build operator inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList(); std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
// Build node inlink edge table.
std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list, void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
mir::Node *node, mir::Node *node,
std::set<mir::Node *> *visited, std::set<mir::Node *> *visited,
......
...@@ -30,10 +30,8 @@ namespace paddle { ...@@ -30,10 +30,8 @@ namespace paddle {
namespace lite { namespace lite {
namespace mir { namespace mir {
using inference::analysis::Dot;
std::string SubgraphVisualizer::operator()() { std::string SubgraphVisualizer::operator()() {
inference::analysis::Dot dot; Dot dot;
const std::vector<std::string> subgraph_colors{ const std::vector<std::string> subgraph_colors{
"red", "green", "cyan", "bisque3", "red", "green", "cyan", "bisque3",
"coral", "darkseagreen1", "goldenrod1", "darkorchid", "coral", "darkseagreen1", "goldenrod1", "darkorchid",
...@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { ...@@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs( std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
node_map_t *nodes) { node_map_t *nodes) {
for (auto &it : *nodes) { for (auto &ordered_node : graph_->NodeTopologicalOrder()) {
node_dat_t *node = it.second; // different orders when traversing nodes in the graph may lead to
// different subgraph divisions, which may generate different results
// on devices such as MLU. These different results are all "right"
// but a little confusing. Thus the topological order is used instead
// of the address of the node in the graph.
CHECK(nodes->find(ordered_node) != nodes->end());
node_dat_t *node = (*nodes)[ordered_node];
if (!node->marked) { if (!node->marked) {
continue; continue;
} }
...@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes, ...@@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector<Node *> &op_nodes,
unused_var_nodes->insert(var_node); unused_var_nodes->insert(var_node);
continue; continue;
} }
// Var can have more than one next op node, So, if any one in the // A var can have more than one next op node, so only if all next op nodes are in
// op_nodes then continue // op_nodes should it be put into local_var_nodes
bool next_op_in_nodes = false; bool next_op_in_nodes = true;
for (auto &next_op_node : var_node->outlinks) { for (auto &next_op_node : var_node->outlinks) {
if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) ==
op_nodes.end()) { op_nodes.end()) {
next_op_in_nodes = true; next_op_in_nodes = false;
break;
} }
} }
if (next_op_in_nodes) { if (next_op_in_nodes) {
......
...@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)}, Place{TARGET(kNPU), PRECISION(kFloat)},
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)}, Place{TARGET(kXPU), PRECISION(kFloat)},
#endif #endif
}); });
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <vector> #include <vector>
#include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/core/mir/subgraph/subgraph_detector.h"
#include "lite/utils/env.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/xpu/bridges/paddle_use_bridges.h" #include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
...@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
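// min_subgraph_size is 1, so even a single supported op is fused into an
// MLU subgraph.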
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
} // namespace mir } // namespace mir
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) ...@@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)}); .BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)});
...@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { ...@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class MLUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir } // namespace mir
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { ...@@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif #endif
auto tar_predictor = TestModel(FLAGS_model_dir, auto tar_predictor = TestModel(FLAGS_model_dir,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
class SubgraphCastDisplayPass : public DebugPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
VLOG(3) << "== Argument types ==";
for (auto& node : graph->mutable_nodes()) {
if (!node.IsArg()) continue;
auto* type = node.AsArg().type;
if (type) {
VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
} else {
VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
}
}
VLOG(3) << "---------------------";
//
VLOG(0) << "== SubgraphOp Debug Info ==";
for (auto& node : graph->mutable_nodes()) {
if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
VLOG(0) << "FOUND SUBGRAPH OP";
display_debug_info(node, "subgraph");
break;
}
}
VLOG(0) << "---------------------";
}
void display_debug_info(const Node& node,
std::string op_type,
bool display_in_nodes = true,
bool display_out_nodes = true) {
CHECK(node.IsStmt());
VLOG(0) << node.AsStmt();
if (display_in_nodes) {
for (auto p_in_arg_node : node.inlinks) {
CHECK(p_in_arg_node->IsArg());
VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
<< " type: " << *p_in_arg_node->AsArg().type
<< " is_weight: " << p_in_arg_node->AsArg().is_weight
<< " is_persist: " << p_in_arg_node->AsArg().is_persist
<< " input_count: " << p_in_arg_node->inlinks.size();
if (p_in_arg_node->inlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
CHECK(p_in_stmt_node->IsStmt());
std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
if (display_out_nodes) {
for (auto p_out_arg_node : node.outlinks) {
CHECK(p_out_arg_node->IsArg());
VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
<< " type: " << *p_out_arg_node->AsArg().type
<< " is_weight: " << p_out_arg_node->AsArg().is_weight
<< " is_persist: " << p_out_arg_node->AsArg().is_persist
<< " output_count: " << p_out_arg_node->outlinks.size();
if (p_out_arg_node->outlinks.size() == 0) {
VLOG(0) << "** END with No Op";
}
for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
CHECK(p_out_stmt_node->IsStmt());
std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type();
if (stmt_op_type == "cast" || stmt_op_type == "transpose" ||
stmt_op_type == "io_copy") {
display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
} else {
VLOG(0) << "** END with op type: " << stmt_op_type;
}
}
}
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(subgraph_cast_display_pass,
paddle::lite::mir::SubgraphCastDisplayPass)
.BindTargets({TARGET(kAny)});
...@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "picked, opencl found"; VLOG(4) << "picked, opencl found";
is_found = true; is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) && } else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) { TargetCompatibleTo(*out_arg_ty, to)) {
VLOG(4) << "picked"; VLOG(4) << "picked";
is_found = true; is_found = true;
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <array>
#include <string>
#include <vector>
#include "lite/core/mir/dot.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
#include "lite/core/op_lite.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUPatternMatcher::operator()(SSAGraph *graph,
XPUPatternMatcher::handle_t handler) {
if (!MarkPMNodesInGraph(graph)) {
return;
}
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
LOG(INFO) << "detected " << subgraphs.size() << " subgraph";
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
VLOG(3) << "mark pmnodes in graph";
if (graph->nodes().empty()) return false;
for (auto &node : graph->mutable_nodes()) {
for (const auto &pmnode : pattern_.nodes()) {
if (pmnode->Tell(&node)) {
pmnodes2nodes_[pmnode.get()].insert(&node);
}
}
}
// Check to early stop if some PMNode can't find matched Node.
for (auto &pmnode : pattern_.nodes()) {
if (!pmnodes2nodes_.count(pmnode.get())) {
VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
// return false;
}
}
VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
return !pmnodes2nodes_.empty();
}
// The intermediate Nodes can only link to the nodes inside the pattern, or this
// subgraph will be dropped.
void XPUPatternMatcher::ValidateByNodeRole(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
subgraphs->erase(
std::remove_if(subgraphs->begin(),
subgraphs->end(),
[](const XPUPatternMatcher::subgraph_t &subgraph) -> bool {
// Collect the inlinks and outlinks.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->outlinks) {
if (!ios.count(x)) {
return true;
}
}
}
}
return false;
}),
subgraphs->end());
for (auto &subgraph : *subgraphs) {
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
ios.insert(item.second);
}
extra_input_vars_.emplace_back();
for (auto &item : subgraph) {
for (auto *x : item.second->inlinks) {
if (x->IsArg() && ios.count(x) == 0) {
// extra weight var
extra_input_vars_.back().push_back(x);
}
}
}
}
}
struct HitGroup {
std::unordered_map<PMNode *, Node *> roles;
bool Match(Node *node, PMNode *pat) {
if (nodes_.count(node)) {
if (roles.count(pat) && roles[pat] == node) return true;
return false;
} else {
if (roles.count(pat) && roles[pat] != node) return false;
return true;
}
}
void Register(Node *node, PMNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outlinks) {
if (b == node) {
return true;
}
}
return false;
}
std::vector<PatternMatcher::subgraph_t> XPUPatternMatcher::DetectPatterns() {
// Init empty subgraphs.
std::vector<PatternMatcher::subgraph_t> result;
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
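// bi_records works as a ping-pong buffer: one slot holds the partial matches
// from the previous edge, the other collects the groups extended by the
// current edge.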
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pmnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pmnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
}
int step = 0;
bi_records[0] = std::move(init_groups);
// Extend a PMNode to subgraphs by deducing the connection relations defined
// in edges of PMNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicated here.
// Each role has two PMNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pmnodes2nodes_[edge.first]) {
for (Node *target : pmnodes2nodes_[edge.second]) {
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
if (IsNodesLink(source, target)) {
HitGroup new_group = group;
bool flag = new_group.Match(source, edge.first) &&
new_group.Match(target, edge.second);
if (flag) {
new_group.Register(source, edge.first);
new_group.Register(target, edge.second);
cur_groups.push_back(new_group);
// TODO(Superjomn) need to unique
}
}
}
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
}
for (auto &group : bi_records[step % 2]) {
XPUPatternMatcher::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
}
return result;
}
struct GraphItemLessThan {
bool operator()(const std::pair<PMNode *, Node *> &a,
const std::pair<PMNode *, Node *> &b) {
if (a.first != b.first) {
return a.first < b.first;
} else {
return a.second < b.second;
}
}
};
// TODO(Superjomn) enhance the function as it marks unique subgraphs as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void XPUPatternMatcher::UniquePatterns(
std::vector<PatternMatcher::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<PatternMatcher::subgraph_t> result;
std::unordered_set<size_t> set;
std::hash<std::string> hasher;
for (auto &g : *subgraphs) {
// Sort the items in the sub-graph, and transform to a string key.
std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
STL::stringstream ss;
for (auto &item : sorted_keys) {
ss << reinterpret_cast<size_t>(item.first) << ":"
<< reinterpret_cast<size_t>(item.second);
}
auto key = hasher(ss.str());
if (!set.count(key)) {
result.emplace_back(g);
set.insert(key);
}
}
*subgraphs = result;
}
void XPUPatternMatcher::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
}
}
*subgraphs = result;
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
/*
* XPUPatternMatcher helps to detect the specific patterns in the graph.
* Input a pattern, output a list of the matched subgraphs/nodes.
* This helper can be used to support fusion (e.g. conv + batch_norm => one
* fused op).
*
* The algorithm has three phases:
* 1. Mark the nodes that match the defined PMNodes in a PMPattern,
* 2. Extend a PMNode to subgraphs by deducing the connection relation defined
* in PMPattern (the edges),
* 3. Get the filtered subgraphs and treat them with a pre-defined handler.
*
* Usage:
* // Create a matcher
* PatternMatcher matcher;
* // Define the matcher's pattern, by adding PMNode and define the edges.
* auto* node0 = matcher.mutable_pattern().AddNode(...)
* auto* node1 = matcher.mutable_pattern().AddNode(...)
* node0->teller = some lambda.
* node1->teller = some lambda.
* matcher.mutable_pattern().AddEdge(node0, node1);
* // Create a handler to define the behavior of treating the filtered
* // subgraphs that comply with the patterns.
* PatternMatcher::handle_t handler = some lambda
* // Execute the matcher.
* matcher(&graph, handler);
*/
struct XPUPatternMatcher {
using subgraph_t = std::unordered_map<PMNode*, Node*>;
// Operate on the detected pattern.
using handle_t =
std::function<void(const subgraph_t& /*matched pattern*/, SSAGraph*)>;
void operator()(SSAGraph* graph, handle_t handler);
const PMPattern& pattern() const { return pattern_; }
PMPattern* mutable_pattern() { return &pattern_; }
// Mark the nodes that fits the pattern.
bool MarkPMNodesInGraph(SSAGraph* graph);
// Detect all the patterns and output the hit records.
std::vector<subgraph_t> DetectPatterns();
// Remove duplicate patterns.
void UniquePatterns(std::vector<subgraph_t>* subgraphs);
// Remove overlapped matched subgraphs; when overlapped, keep the previous one.
// The intermediate PMNodes will be removed, so they can't be shared by
// multiple patterns.
void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes.
void ValidateByNodeRole(std::vector<subgraph_t>* subgraphs);
using hit_rcd_t =
std::pair<Node* /*node in graph*/, PMNode* /*node in pattern*/>;
PMPattern pattern_;
std::unordered_map<const PMNode*, std::unordered_set<Node*>> pmnodes2nodes_;
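// Extra vars (typically weights) that feed a matched subgraph from outside
// the pattern; one vector per matched subgraph, filled by ValidateByNodeRole.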
std::vector<std::vector<Node*>> extra_input_vars_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include <set>
#include <unordered_set>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) {
VLOG(4) << "\n" << matcher_.pattern().DotString();
// Get subgraphs and record the mir::Node pointers for each PMNode.
auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
// get all the registered nodes.
key2nodes_.emplace_back();
for (auto &item : nodes_) {
key2nodes_.back()[item.first] = subgraph.at(item.second);
}
};
matcher_(graph, handler);
}
void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) {
std::set<std::string> keys;
for (auto &node : nodes_) {
if (node.second->IsIntermediate()) {
keys.insert(node.first);
}
}
VLOG(4) << "keys: " << key2nodes_.size();
std::unordered_set<const Node *> nodes2rm;
for (auto &matched : key2nodes_) {
for (const auto &key : keys) {
nodes2rm.insert(matched.at(key));
}
}
VLOG(3) << "clean nodes " << nodes2rm.size();
GraphSafeRemoveNodes(graph, nodes2rm);
}
PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) {
auto it = nodes_.find(key);
if (it != nodes_.end()) {
return it->second;
}
nodes_.emplace(key,
matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
it = nodes_.find(key);
return it->second;
}
PMNode *XPUFuseBase::OpNode(const std::string &key,
const std::string &op_type) {
GetOrCreateNode(key)->set_op_type(op_type);
GetOrCreateNode(key)->AsOp(op_type);
return GetOrCreateNode(key);
}
PMNode *XPUFuseBase::VarNode(const std::string &key) {
GetOrCreateNode(key)->AsVar();
return GetOrCreateNode(key);
}
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <vector>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/core/mir/xpu_pattern_matcher.h"
namespace paddle {
namespace lite {
namespace mir {
namespace xpu {
class XPUFuseBase {
public:
using key2nodes_t = std::map<std::string, Node*>;
virtual ~XPUFuseBase() = default;
void operator()(SSAGraph* graph) {
BuildPattern();
PerformPatternMatcher(graph);
for (size_t i = 0; i < key2nodes_.size(); ++i) {
InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]);
}
DeleteInterNodes(graph);
}
// Build a PMPattern using PMNode.
virtual void BuildPattern() = 0;
// Generate an operator desc with a matched subgraph.
virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
return cpp::OpDesc();
}
PMNode* OpNode(const std::string& key) {
return GetOrCreateNode(key)->assert_is_op();
}
PMNode* OpNode(const std::string& key, const std::string& op_type);
PMNode* VarNode(const std::string& key);
protected:
virtual void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
const std::vector<Node*>& extra_input_vars) = 0;
void PerformPatternMatcher(SSAGraph* graph);
// Delete nodes that are marked as Intermediate
void DeleteInterNodes(SSAGraph* graph);
PMNode* GetOrCreateNode(const std::string& key);
protected:
XPUPatternMatcher matcher_;
std::map<std::string, PMNode*> nodes_;
std::vector<key2nodes_t> key2nodes_;
};
} // namespace xpu
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, ...@@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
return var->GetMutable<lite::Tensor>(); return var->GetMutable<lite::Tensor>();
} }
void OpLite::AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var) {
bool is_have_input =
op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0;
CHECK(is_dispensable || is_have_input);
if (is_have_input) {
std::string input_var_name = op_desc.Input(input_name).front();
*input_var = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
}
}
void OpLite::AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var) {
bool is_have_output =
op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0;
CHECK(is_dispensable || is_have_output);
if (is_have_output) {
std::string output_var_name = op_desc.Output(output_name).front();
*output_var = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
}
}
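// A minimal usage sketch (hypothetical op, assuming its param struct holds
// lite::Tensor* members):
//   AttachInput(op_desc, scope, "X", /*is_dispensable=*/true, &param_.x);
//   AttachOutput(op_desc, scope, "Out", /*is_dispensable=*/false, &param_.out);
// Dispensable inputs/outputs are simply left untouched when absent from
// op_desc.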
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -105,6 +105,20 @@ class OpLite : public Registry { ...@@ -105,6 +105,20 @@ class OpLite : public Registry {
return kernel_.get(); return kernel_.get();
} }
// Attach input variable from scope by op_desc and input name
void AttachInput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &input_name,
bool is_dispensable,
lite::Tensor **input_var);
// Attach output variable from scope by op_desc and output name
void AttachOutput(const cpp::OpDesc &op_desc,
lite::Scope *scope,
const std::string &output_name,
bool is_dispensable,
lite::Tensor **output_var);
virtual ~OpLite() = default; virtual ~OpLite() = default;
protected: protected:
......
...@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry() ...@@ -152,6 +152,8 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kMLU, kInt16, kNCHW); INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kInt32, kNCHW);
INIT_FOR(kHost, kInt64, kNCHW);
INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC); INIT_FOR(kHost, kFloat, kNHWC);
INIT_FOR(kHost, kFloat, kAny); INIT_FOR(kHost, kFloat, kAny);
......
...@@ -135,6 +135,12 @@ class KernelRegistry final { ...@@ -135,6 +135,12 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt32),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
......
...@@ -76,6 +76,8 @@ class Optimizer { ...@@ -76,6 +76,8 @@ class Optimizer {
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
#endif #endif
"__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"quantized_op_attributes_inference_pass", // Only for fully "quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer // quantized model, infer
// the output scale and // the output scale and
...@@ -116,9 +118,15 @@ class Optimizer { ...@@ -116,9 +118,15 @@ class Optimizer {
"variable_place_inference_pass", // "variable_place_inference_pass", //
"argument_type_display_pass", "argument_type_display_pass",
"mlu_subgraph_pass",
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", "argument_type_display_pass",
"mlu_postprocess_pass",
"memory_optimize_pass"}}; "memory_optimize_pass"}};
if (passes.size() == 1) { if (passes.size() == 1) {
passes_local.push_back(passes[0]); passes_local.push_back(passes[0]);
} }
......
...@@ -69,6 +69,13 @@ class WorkSpace { ...@@ -69,6 +69,13 @@ class WorkSpace {
} }
#endif #endif
#if defined(LITE_WITH_MLU)
static WorkSpace& Global_MLU() {
thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
return *x;
}
#endif
private: private:
explicit WorkSpace(TargetType x) : target_(x) {} explicit WorkSpace(TargetType x) : target_(x) {}
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace fluid { namespace fluid {
using LoD = std::vector<std::vector<size_t>>; using LoD = std::vector<std::vector<uint64_t>>;
static LoD ToAbsOffset(const LoD &in) { static LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
......
...@@ -10,4 +10,5 @@ add_subdirectory(opencl) ...@@ -10,4 +10,5 @@ add_subdirectory(opencl)
add_subdirectory(fpga) add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
...@@ -179,6 +179,34 @@ void SquareCompute::Run() { ...@@ -179,6 +179,34 @@ void SquareCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads()); x_data, output_data, x_dims.production(), ctx.threads());
} }
void HardSwishCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
float threshold = param.hard_swish_threshold;
float scale = param.hard_swish_scale;
float offset = param.hard_swish_offset;
lite::arm::math::act_hard_swish<float>(x_data,
output_data,
x_dims.production(),
threshold,
scale,
offset,
ctx.threads());
}
void ReciprocalCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_reciprocal<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
...@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL( ...@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL(
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(hard_swish,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::HardSwishCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(reciprocal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ReciprocalCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
...@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~SquareCompute() = default; virtual ~SquareCompute() = default;
}; };
class HardSwishCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~HardSwishCompute() = default;
};
class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~ReciprocalCompute() = default;
};
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne ...@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/ctc_align_compute.h"
#include <algorithm>
#include <cstring>
#include <map>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
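// Convert a length-based LoD (each entry stores a sequence length) into
// cumulative offsets, e.g. {2, 3, 1} -> {0, 2, 5, 6}.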
LoD ToAbs(const LoD& in) {
if (in.empty()) return in;
LoD result;
for (auto& src : in) {
std::vector<uint64_t> dest(src.size() + 1, 0);
for (int i = 0; i < src.size(); i++) {
dest[i + 1] = dest[i] + src[i];
}
result.emplace_back(dest);
}
return result;
}
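// Inverse of ToAbs: turn cumulative offsets back into per-sequence lengths.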
LoD ToNorm(const LoD& in) {
if (in.empty()) return in;
LoD result;
for (auto& src : in) {
std::vector<uint64_t> dest(src.size() - 1, 0);
for (int i = 0; i < dest.size(); i++) {
dest[i] = src[i + 1] - src[i];
}
result.emplace_back(dest);
}
return result;
}
LoD ToAbsOffset(const LoD& in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
for (size_t i = 0; i < in[level].size(); ++i) {
size_t index = in[level][i];
result[level][i] = result[level + 1][index];
}
}
return result;
}
template <typename T, PrecisionType PT>
void CtcAlignCompute<T, PT>::Run() {
auto& param = this->template Param<operators::CtcAlignParam>();
auto* input = param.input;
auto* output = param.output;
size_t blank = static_cast<size_t>(param.blank);
bool merge_repeated = param.merge_repeated;
size_t padding_value = static_cast<size_t>(param.padding_value);
const auto* input_data = input->template data<T>();
auto input_dims = input->dims();
auto* output_data = output->template mutable_data<T>();
if (input->lod().empty()) {
auto* input_length = param.input_length;
auto* output_length = param.output_length;
CHECK(input_length != nullptr);
CHECK(output_length != nullptr);
const auto* input_length_data = input_length->template data<T>();
auto* output_length_data = output_length->template mutable_data<T>();
for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; batch_id++) {
T prev_token = -1;
size_t output_idx = 0;
for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
size_t input_ind = batch_id * input_dims[1] + i;
if ((unsigned)input_data[input_ind] != blank &&
!(merge_repeated && input_data[input_ind] == prev_token)) {
output_data[batch_id * input_dims[1] + output_idx] =
input_data[input_ind];
++output_idx;
}
prev_token = input_data[input_ind];
}
output_length_data[batch_id] = output_idx;
for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
output_data[batch_id * input_dims[1] + j] = padding_value;
}
} else {
const size_t level = 0;
auto input_lod = input->lod();
input_lod = ToAbs(input->lod());
input_lod = ToAbsOffset(input_lod);
CHECK_EQ(input_dims[0], static_cast<int64_t>(input_lod[level].back()));
const size_t num_sequences = input_lod[level].size() - 1;
// merge repeated tokens and remove blank tokens
size_t output_idx = 0;
std::vector<uint64_t> output_lod0(1, 0);
for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
T prev_token = -1;
for (size_t i = input_lod[level][seq_idx];
i < input_lod[level][seq_idx + 1];
++i) {
if ((unsigned)input_data[i] != blank &&
!(merge_repeated && input_data[i] == prev_token)) {
output_data[output_idx] = input_data[i];
++output_idx;
}
prev_token = input_data[i];
}
output_lod0.push_back(static_cast<uint64_t>(output_idx));
}
LoD output_lod;
output_lod.push_back(output_lod0);
output_lod = ToNorm(output_lod);
output->set_lod(output_lod);
output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
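// If every token was removed (all blanks), emit a single -1 placeholder.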
if (output_lod0.back() == 0) {
output->Resize({1, 1});
output_data = output->template mutable_data<T>();
output_data[0] = -1;
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
using ctc_align_int64 =
paddle::lite::kernels::host::CtcAlignCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(ctc_align, kHost, kInt64, kNCHW, ctc_align_int64, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindInput("InputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.BindOutput("OutputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))})
.Finalize();
using ctc_align_int32 =
paddle::lite::kernels::host::CtcAlignCompute<int32_t, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(ctc_align, kHost, kInt32, kNCHW, ctc_align_int32, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("InputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("OutputLength",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T, PrecisionType PT>
class CtcAlignCompute : public KernelLite<TARGET(kHost), PT> {
public:
void Run() override;
virtual ~CtcAlignCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -6,3 +6,4 @@ add_subdirectory(bridges) ...@@ -6,3 +6,4 @@ add_subdirectory(bridges)
add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
...@@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS ...@@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS
lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
set(mlu_subgraph_bridges set(mlu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
subgraph_bridge_utility_mlu subgraph_bridge_utility_mlu
...@@ -26,16 +29,20 @@ set(mlu_subgraph_bridges ...@@ -26,16 +29,20 @@ set(mlu_subgraph_bridges
subgraph_bridge_softmax_op_mlu subgraph_bridge_softmax_op_mlu
subgraph_bridge_fc_op_mlu subgraph_bridge_fc_op_mlu
subgraph_bridge_batch_norm_op_mlu subgraph_bridge_batch_norm_op_mlu
subgraph_bridge_scale_op_mlu
subgraph_bridge_interp_op_mlu
subgraph_bridge_concat_op_mlu
CACHE INTERNAL "mlu_subgraph_bridges") CACHE INTERNAL "mlu_subgraph_bridges")
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
...@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
VLOG(3) << "[MLU] Converting " + op_type + "..."; VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op // Create act node and set params from op
auto fp_type = graph->FPType();
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front(); auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>(); auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize(); auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type);
CHECK(graph->HasNode(x_var_name)); CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name); auto input_tensor = graph->GetNode(x_var_name);
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
cnmlBaseOp_t activation_op; cnmlBaseOp_t activation_op;
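// leaky_relu has no direct CNML activation type; it is lowered to a PReLU
// op whose slope is a 1x1x1x1 constant tensor holding alpha.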
if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op);
auto alpha_tensor =
graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type);
graph->BindConstRawData(alpha_var_name, &alpha, 1, true);
CNML_CALL(cnmlCreatePreluOp(&activation_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor()));
} else {
cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
CNML_CALL(cnmlCreateActiveOp(&activation_op, CNML_CALL(cnmlCreateActiveOp(&activation_op,
act_type, act_type,
input_tensor->mlu_tensor(), input_tensor->mlu_tensor(),
output_tensor->mlu_tensor())); output_tensor->mlu_tensor()));
}
graph->FuseOp(activation_op); graph->FuseOp(activation_op);
return SUCCESS; return SUCCESS;
} }
...@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(sigmoid,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
kMLU,
paddle::lite::subgraph::mlu::ActConverter);
...@@ -25,8 +25,6 @@ namespace lite { ...@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ActConverter(void* ctx, OpLite* op);
template void FillTensor<float, int>(Tensor* x, template void FillTensor<float, int>(Tensor* x,
float lower = -2, float lower = -2,
float upper = -2); float upper = -2);
...@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) { ...@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
TEST(MLUBridges, activation) { TEST(MLUBridges, activation) {
std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
std::vector<std::string> types{"sigmoid", "relu", "tanh"}; std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
for (auto x_shape : shapes) { for (auto x_shape : shapes) {
for (auto op_type : types) { for (auto op_type : types) {
test_act(x_shape, op_type); test_act(x_shape, op_type);
...@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) { ...@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter); USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(relu, kMLU)
sigmoid, USE_SUBGRAPH_BRIDGE(tanh, kMLU)
paddle::lite::subgraph::mlu::ActConverter); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
...@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>(); auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize(); auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
CHECK(graph->HasNode(x_var_name)); CHECK(graph->HasNode(x_var_name));
......
...@@ -23,8 +23,6 @@ namespace lite { ...@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int BatchNormConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) { void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -139,9 +137,7 @@ void test_batch_norm( ...@@ -139,9 +137,7 @@ void test_batch_norm(
{bs, ic, ih, iw}, {bs, ic, ih, iw},
{0, 2, 3, 1}); {0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) { ...@@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
batch_norm,
paddle::lite::subgraph::mlu::BatchNormConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X");
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto param_axis = op_info->GetAttr<int>("axis");
std::vector<cnmlTensor_t> input_tensor;
for (auto x_name : x_var_name) {
CHECK(graph->HasNode(x_name));
input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor());
}
auto dims = output_dims.size();
int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
CHECK_LT(axis, 4) << "Unsupported axis for MLU concat";
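// MLU tensors are laid out as NHWC, so remap the NCHW concat axis to its
// NHWC position before creating the concat op.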
int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2};
int nhwc_axis = nchw_to_nhwc_axis_map[axis];
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t concat_op;
cnmlTensor_t outputs = output_tensor->mlu_tensor();
CNML_CALL(cnmlCreateNdConcatOp(&concat_op,
nhwc_axis,
input_tensor.data(),
x_var_name.size(),
&outputs,
1));
graph->FuseOp(concat_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(concat,
kMLU,
paddle::lite::subgraph::mlu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/concat_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void concat_ref(const std::shared_ptr<operators::ConcatOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = op_info->Input("X");
std::vector<lite::Tensor*> inputs;
for (auto var : x) {
inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int axis = op_info->GetAttr<int>("axis");
std::vector<lite::Tensor*> inputs_concat(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = inputs[j];
}
size_t num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0]->dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> inputs_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i]->numel() / rows;
out_cols += t_cols;
inputs_cols[i] = t_cols;
}
for (int k = 0; k < out_rows; ++k) {
float* dst_ptr = out->mutable_data<float>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = inputs_cols[j];
const float* src_ptr = inputs[j]->data<float>() + k * col_len;
std::memcpy(dst_ptr + col_idx, src_ptr, sizeof(float) * col_len);
col_idx += col_len;
}
}
}
void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
// prepare input&output variables
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
x->Resize(DDim(input[0]));
y->Resize(DDim(input[1]));
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
CHECK_EQ(out->dims(), out_ref->dims());
// initialize input&output data
FillTensor<float>(x);
FillTensor<float>(y);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("concat");
opdesc.SetInput("X", {x_var_name, y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
auto op = CreateOp<operators::ConcatOpLite>(opdesc, &scope);
concat_ref(op);
out_ref->CopyDataFrom(*out);
Tensor input_x, input_y;
input_x.Resize(DDim(input[0]));
input_y.Resize(DDim(input[1]));
transpose(x->mutable_data<float>(),
input_x.mutable_data<float>(),
{static_cast<int>(input[0][0]),
static_cast<int>(input[0][1]),
static_cast<int>(input[0][2]),
static_cast<int>(input[0][3])},
{0, 2, 3, 1});
transpose(y->mutable_data<float>(),
input_y.mutable_data<float>(),
{static_cast<int>(input[1][0]),
static_cast<int>(input[1][1]),
static_cast<int>(input[1][2]),
static_cast<int>(input[1][3])},
{0, 2, 3, 1});
x->CopyDataFrom(input_x);
y->CopyDataFrom(input_y);
LaunchOp(op, {x_var_name, y_var_name}, {out_var_name});
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out->dims());
auto os = out->dims();
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
}
}
TEST(MLUBridges, concat) {
test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0);
test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1);
test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2);
test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(concat, kMLU);
...@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
const auto* scope = op->scope(); const auto* scope = op->scope();
VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; VLOG(3) << "[MLU] Converting " << op_info->Type() << "... ";
// Get input, filter and op attributes // get input, filter and op attributes
const auto input_var_name = op_info->Input("Input").front(); const auto input_var_name = op_info->Input("Input").front();
const auto& input_dims_nhwc = const auto& input_dims =
scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims(); scope->FindVar(input_var_name)->GetMutable<Tensor>()->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
const auto filter_var_name = op_info->Input("Filter").front(); const auto filter_var_name = op_info->Input("Filter").front();
auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>(); auto* filter = scope->FindVar(filter_var_name)->GetMutable<Tensor>();
const auto& filter_dims = filter->dims(); const auto& filter_dims = filter->dims();
const auto output_var_name = op_info->Output("Output").front(); const auto output_var_name = op_info->Output("Output").front();
auto* output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
const auto output_shape = output->dims().Vectorize();
const auto bs = input_dims[0]; const auto bs = input_dims[0];
const auto oc = filter_dims[0]; const auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4); CHECK_EQ(input_dims.size(), 4);
...@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
input_dims, input_dims,
filter_dims); filter_dims);
std::vector<int64_t> output_shape({bs, oc}); const auto output_tensor = graph->AddNode(
for (size_t i = 0; i < 2; i++) { output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) /
strides[i] +
1);
}
const auto output_shape_nhwc = DimNCHW2NHWC(output_shape);
const auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
// Create filter node // Create filter node
const auto filter_tensor = graph->AddNode(filter_var_name, const auto filter_tensor = graph->AddNode(filter_var_name,
...@@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG(FATAL) << "UnSupported weight precision!"; LOG(FATAL) << "UnSupported weight precision!";
} }
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
std::string bias_var_name; std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor; std::shared_ptr<MLUTensor> bias_tensor;
if (HasInputArg(op_info, scope, "Bias")) { if (HasInputArg(op_info, scope, "Bias")) {
...@@ -160,8 +137,66 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -160,8 +137,66 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
graph->FPType()); graph->FPType());
graph->BindConstData(bias_var_name, bias); graph->BindConstData(bias_var_name, bias);
} }
cnmlBaseOp_t conv_op;
const auto input_scale = op_info->GetAttr<float>("input_scale"); const auto input_scale = op_info->GetAttr<float>("input_scale");
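// The "first conv" path fuses the image mean/std normalization into the
// convolution and consumes uint8 input; it is only taken for 3-channel
// inputs when the device enables it.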
bool use_first_conv = false;
if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) {
use_first_conv = true;
}
cnmlBaseOp_t conv_op;
if (use_first_conv) {
cnmlConvFirstOpParam_t conv_param;
CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[2],
paddings[2],
paddings[0],
paddings[0]));
const auto mean_tensor = graph->AddNode("first_conv_mean_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
const auto std_tensor = graph->AddNode("first_conv_std_tensor",
std::vector<int64_t>{3},
CNML_CONST,
CNML_CNHW,
graph->FPType());
graph->BindConstRawData("first_conv_mean_tensor",
lite::DeviceInfo::Global().MeanVec().data(),
3,
false);
graph->BindConstRawData("first_conv_std_tensor",
lite::DeviceInfo::Global().StdVec().data(),
3,
false);
graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8);
CNML_CALL(cnmlCreateConvFirstOpForward(
&conv_op,
conv_param,
graph->GetNode(input_var_name)->mlu_tensor(),
mean_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr,
std_tensor->mlu_tensor()));
CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param));
} else {
cnmlConvOpParam_t conv_param;
CNML_CALL(cnmlCreateConvOpParam(&conv_param,
strides[0],
strides[1],
dilations[0],
dilations[1],
paddings[0] * 2,
paddings[2] * 2));
CNML_CALL(cnmlCreateConvOpForward( CNML_CALL(cnmlCreateConvOpForward(
&conv_op, &conv_op,
conv_param, conv_param,
...@@ -169,6 +204,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -169,6 +204,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
output_tensor->mlu_tensor(), output_tensor->mlu_tensor(),
filter_tensor->mlu_tensor(), filter_tensor->mlu_tensor(),
bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); bias_tensor ? bias_tensor->mlu_tensor() : nullptr));
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
}
graph->SetComputingDataType( graph->SetComputingDataType(
conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale);
...@@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
} }
graph->BindConstData(filter_var_name, filter); graph->BindConstData(filter_var_name, filter);
graph->FuseOp(conv_op); graph->FuseOp(conv_op);
CNML_CALL(cnmlDestroyConvOpParam(&conv_param));
return REBUILD_WHEN_SHAPE_CHANGED; return REBUILD_WHEN_SHAPE_CHANGED;
} }
......
...@@ -25,8 +25,6 @@ namespace lite { ...@@ -25,8 +25,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ConvConverter(void* ctx, OpLite* op);
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) { void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -246,10 +244,6 @@ void test_conv(int bs, ...@@ -246,10 +244,6 @@ void test_conv(int bs,
} }
} }
input->Resize({bs, ih, iw, ic});
output->Resize(
{output_shape[0], output_shape[2], output_shape[3], output_shape[1]});
// create and convert op to MLU model, then run it on MLU // create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope); auto op = CreateOp<operators::ConvOpLite>(opdesc_mlu, &scope);
LaunchOp(op, {input_var_name}, {output_var_name}); LaunchOp(op, {input_var_name}, {output_var_name});
...@@ -342,9 +336,5 @@ TEST(MLUBridges, conv) { ...@@ -342,9 +336,5 @@ TEST(MLUBridges, conv) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
conv2d, USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
paddle::lite::subgraph::mlu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
depthwise_conv2d,
paddle::lite::subgraph::mlu::ConvConverter);
...@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_tensor = graph->AddNode(out_var_name, auto output_tensor = graph->AddNode(out_var_name,
x->dims().Vectorize(), x->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
cnmlBaseOp_t elementwise_op; cnmlBaseOp_t elementwise_op;
...@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto mid_tensor = graph->AddNode(out_var_name + "_mid", auto mid_tensor = graph->AddNode(out_var_name + "_mid",
x->dims().Vectorize(), x->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op,
x_tensor->mlu_tensor(), x_tensor->mlu_tensor(),
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int ElementwiseConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) { void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) { ...@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
elementwise_add, USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
paddle::lite::subgraph::mlu::ElementwiseConverter); USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
elementwise_sub,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_mul,
paddle::lite::subgraph::mlu::ElementwiseConverter);
REGISTER_SUBGRAPH_BRIDGE(MLU,
elementwise_div,
paddle::lite::subgraph::mlu::ElementwiseConverter);
...@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims"); // int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>(); auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>(); auto w = scope->FindVar(w_var_name)->GetMutable<Tensor>();
auto output = scope->FindVar(output_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto w_dims = w->dims(); auto w_dims = w->dims();
...@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto input_scale = op_info->GetAttr<float>("input_scale"); auto input_scale = op_info->GetAttr<float>("input_scale");
std::vector<int64_t> output_shape_nhwc({1, 1, 1, w_dims[1]});
auto output_tensor = graph->AddNode(output_var_name, auto output_tensor = graph->AddNode(output_var_name,
output_shape_nhwc, output->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph->FPType()); graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
std::string bias_var_name; std::string bias_var_name;
std::shared_ptr<MLUTensor> bias_tensor; std::shared_ptr<MLUTensor> bias_tensor;
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int FCConverter(void* ctx, OpLite* op);
void fc_ref(const std::shared_ptr<operators::FcOpLite> op) { void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -141,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape, ...@@ -141,15 +139,34 @@ void test_fc(const std::vector<int64_t>& input_shape,
} }
auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope); auto fc_op_mlu = CreateOp<operators::FcOpLite>(fc_op_desc_mlu, &scope);
input->Resize({static_cast<int>(input_shape[0]),
Tensor input_tmp, out_tmp;
input_tmp.Resize(input_shape);
transpose(input->mutable_data<float>(),
input_tmp.mutable_data<float>(),
{static_cast<int>(input_shape[0]),
static_cast<int>(input_shape[1]),
static_cast<int>(input_shape[2]), static_cast<int>(input_shape[2]),
static_cast<int>(input_shape[3]), static_cast<int>(input_shape[3])},
static_cast<int>(input_shape[1])}); {0, 2, 3, 1});
out->Resize({static_cast<int>(input_shape[0]), static_cast<int>(w_shape[1])}); input->CopyDataFrom(input_tmp);
LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name});
// compare results auto os = out->dims();
out_tmp.Resize(os);
auto* out_data = out->mutable_data<float>(); auto* out_data = out->mutable_data<float>();
// transpose(out_data,
// out_tmp.mutable_data<float>(),
// {static_cast<int>(os[0]),
// static_cast<int>(os[2]),
// static_cast<int>(os[3]),
// static_cast<int>(os[1])},
// {0, 3, 1, 2});
//
// out_data = out_tmp.mutable_data<float>();
// compare results
auto* out_ref_data = out_ref->mutable_data<float>(); auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) { for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
...@@ -170,4 +187,4 @@ TEST(MLUBridges, fc) { ...@@ -170,4 +187,4 @@ TEST(MLUBridges, fc) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); USE_SUBGRAPH_BRIDGE(fc, kMLU);
...@@ -25,12 +25,12 @@ namespace mlu { ...@@ -25,12 +25,12 @@ namespace mlu {
std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name, std::shared_ptr<MLUTensor> Graph::AddNode(const std::string& name,
std::vector<int64_t> shape, std::vector<int64_t> shape,
cnmlTensorType_t tensor_type, cnmlTensorType_t tensor_type,
cnmlDataOrder_t data_order, cnmlDataOrder_t shape_order,
cnmlDataType_t mlu_dtype, cnmlDataType_t mlu_dtype,
void* raw_ptr) { void* raw_ptr) {
CHECK(!HasNode(name)); CHECK(!HasNode(name));
auto node = std::shared_ptr<MLUTensor>( auto node = std::shared_ptr<MLUTensor>(
new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); new MLUTensor(shape, tensor_type, shape_order, mlu_dtype));
node->set_mlu_ptr(raw_ptr); node->set_mlu_ptr(raw_ptr);
nodes_.insert(std::make_pair(name, node)); nodes_.insert(std::make_pair(name, node));
return node; return node;
......
...@@ -23,6 +23,12 @@ ...@@ -23,6 +23,12 @@
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h"
#define PRINT_HW_TIME false
#if PRINT_HW_TIME
#include <mutex> //NOLINT
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace subgraph { namespace subgraph {
...@@ -32,13 +38,30 @@ namespace mlu { ...@@ -32,13 +38,30 @@ namespace mlu {
// to the MLU IR graph // to the MLU IR graph
class Graph { class Graph {
public: public:
Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } Graph() {
CNML_CALL(cnmlCreateFusionOp(&fusion_op_));
#if PRINT_HW_TIME
CNRT_CALL(cnrtCreateNotifier(&notifier_start_));
CNRT_CALL(cnrtCreateNotifier(&notifier_end_));
#endif
}
~Graph() { ~Graph() {
FreeConstData();
CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_));
for (auto op : ops_) { for (auto op : ops_) {
CNML_CALL(cnmlDestroyBaseOp(&op)); CNML_CALL(cnmlDestroyBaseOp(&op));
} }
#if PRINT_HW_TIME
CNRT_CALL(cnrtDestroyNotifier(&notifier_start_));
CNRT_CALL(cnrtDestroyNotifier(&notifier_end_));
double total_time = 0;
for (auto& f : time_log_) {
total_time += f;
}
std::cout << "cnml hardware time for " << time_log_.size()
<< " process:" << total_time / time_log_.size() << std::endl;
#endif
} }
// Data node // Data node
...@@ -89,6 +112,10 @@ class Graph { ...@@ -89,6 +112,10 @@ class Graph {
} }
void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) {
#if PRINT_HW_TIME
thread_local float hw_time;
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
#endif
CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
input_addrs_.data(), input_addrs_.data(),
input_addrs_.size(), input_addrs_.size(),
...@@ -96,7 +123,61 @@ class Graph { ...@@ -96,7 +123,61 @@ class Graph {
output_addrs_.size(), output_addrs_.size(),
&forward_param, &forward_param,
que)); que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
#endif
CNRT_CALL(cnrtSyncQueue(que)); CNRT_CALL(cnrtSyncQueue(que));
#if PRINT_HW_TIME
CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
hw_time /= 1000.0f;
DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
std::lock_guard<std::mutex> lk(time_mut_);
time_log_.push_back(hw_time);
#endif
}
template <typename T>
void* RegisterConstData(size_t len) {
void* addr = malloc(len * sizeof(T));
const_data_storage_.push_back(addr);
return addr;
}
void FreeConstData() {
for (auto& addr : const_data_storage_) {
free(addr);
}
}
void BindConstRawData(std::string tensor_name,
const float* data,
size_t len,
bool alloc = true) {
void* alloc_data;
if (fp_type_ == CNML_DATA_FLOAT32) {
if (alloc) {
alloc_data = RegisterConstData<float>(len);
memcpy(alloc_data, data, len * sizeof(float));
} else {
alloc_data = const_cast<void*>(static_cast<const void*>(data));
}
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
} else if (fp_type_ == CNML_DATA_FLOAT16) {
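// For fp16 graphs, cast the fp32 host data to fp16 before binding it as
// const data.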
void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
CNRT_CALL(
cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
CNRT_FLOAT32,
data_fp16,
CNRT_FLOAT16,
len,
nullptr));
CNML_CALL(cnmlBindConstData_V2(
nodes_[tensor_name]->mlu_tensor(), data_fp16, false));
} else {
CHECK(0);
}
} }
void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
...@@ -158,6 +239,12 @@ class Graph { ...@@ -158,6 +239,12 @@ class Graph {
std::vector<std::shared_ptr<MLUTensor>> output_tensors_; std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
std::vector<cnmlBaseOp_t> ops_; std::vector<cnmlBaseOp_t> ops_;
cnmlFusionOp_t fusion_op_; cnmlFusionOp_t fusion_op_;
std::vector<void*> const_data_storage_;
#if PRINT_HW_TIME
cnrtNotifier_t notifier_start_{}, notifier_end_{};
std::mutex time_mut_;
std::vector<float> time_log_;
#endif
}; };
} // namespace mlu } // namespace mlu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
auto out = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto x_dims = x->dims();
CHECK_EQ(x_dims.size(), 4);
auto scale = op_info->GetAttr<float>("scale");
auto out_w = op_info->GetAttr<int>("out_w");
auto out_h = op_info->GetAttr<int>("out_h");
auto align_corners = op_info->GetAttr<bool>("align_corners");
CHECK(graph->HasNode(x_var_name));
auto input_tensor = graph->GetNode(x_var_name);
auto in_h = x_dims[2];
auto in_w = x_dims[3];
// Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w
if (HasInputArg(op_info, scope, "SizeTensor")) {
LOG(ERROR) << "Not support SizeTensor input now";
CHECK(0);
} else {
if (HasInputArg(op_info, scope, "Scale")) {
LOG(ERROR) << "Not support Scale input now";
CHECK(0);
}
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
out_h = out_h > 0 ? out_h : -1;
out_w = out_w > 0 ? out_w : -1;
}
if (HasInputArg(op_info, scope, "OutSize")) {
LOG(ERROR) << "Not support OutSize input now";
CHECK(0);
}
}
auto output_tensor = graph->AddNode(out_var_name,
out->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
cnmlBaseOp_t interp_op;
cnmlNearestNeighborOpParam_t nn_param;
CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h));
CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners));
CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
nn_param));
CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param));
graph->FuseOp(interp_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(nearest_interp,
kMLU,
paddle::lite::subgraph::mlu::InterpolateConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/interpolate_op.h"
#include <gtest/gtest.h>
#include <string>
#include "lite/core/device_info.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
template <typename dtype>
void ResizeNearestAlign(const lite::Tensor* x,
lite::Tensor* out,
bool with_align) {
auto x_dims = x->dims();
int num = x_dims[0];
int channels = x_dims[1];
int hin = x_dims[2];
int win = x_dims[3];
int hout = out->dims()[2];
int wout = out->dims()[3];
dtype scale_w = (with_align) ? (static_cast<float>(win - 1) / (wout - 1))
: (static_cast<float>(win) / (wout));
dtype scale_h = (with_align) ? (static_cast<float>(hin - 1) / (hout - 1))
: (static_cast<float>(hin) / (hout));
const dtype* src = x->data<dtype>();
dtype* dst = out->mutable_data<dtype>();
int dst_stride_w = 1;
int dst_stride_h = wout;
int dst_stride_c = wout * hout;
int dst_stride_batch = wout * hout * channels;
int src_stride_w = 1;
int src_stride_h = win;
int src_stride_c = win * hin;
int src_stride_batch = win * hin * channels;
for (int n = 0; n < num; ++n) {
for (int c = 0; c < channels; ++c) {
int src_index = n * src_stride_batch + c * src_stride_c;
for (int h = 0; h < hout; ++h) {
for (int w = 0; w < wout; ++w) {
int fw = (with_align) ? static_cast<int>(scale_w * w + 0.5)
: static_cast<int>(scale_w * w);
fw = (fw < 0) ? 0 : fw;
int fh = (with_align) ? static_cast<int>(scale_h * h + 0.5)
: static_cast<int>(scale_h * h);
fh = (fh < 0) ? 0 : fh;
int w_start = static_cast<int>(fw);
int h_start = static_cast<int>(fh);
int dst_index = n * dst_stride_batch + c * dst_stride_c +
h * dst_stride_h + w * dst_stride_w;
dst[dst_index] =
src[src_index + w_start * src_stride_w + h_start * src_stride_h];
}
}
}
}
}
template <typename DType>
void BilinearInterpRef(const lite::Tensor* x,
lite::Tensor* out,
bool align_corners,
int align_mode) {
auto x_dims = x->dims();
int batch_size = x_dims[0];
int channel_size = x_dims[1];
auto x_h = x_dims[2];
auto x_w = x_dims[3];
CHECK_EQ(x_dims.size(), 4);
auto out_dims = out->dims();
int out_h = out_dims[2];
int out_w = out_dims[3];
// copy from x if no change
if (x_h == out_h && x_w == out_w) {
out->CopyDataFrom(*x);
return;
}
float ratio_h = 0.f;
float ratio_w = 0.f;
if (out_h > 1) {
ratio_h = (align_corners) ? static_cast<float>(x_h - 1) / (out_h - 1)
: static_cast<float>(x_h) / out_h;
}
if (out_w > 1) {
ratio_w = (align_corners) ? static_cast<float>(x_w - 1) / (out_w - 1)
: static_cast<float>(x_w) / out_w;
}
// naive bilinear interpolation
auto x_data = x->data<DType>();
auto out_data = out->mutable_data<DType>();
bool align_flag = (align_mode == 0 && !align_corners);
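  // align_mode == 0 without align_corners follows the half-pixel convention:
  // src_idx = ratio * (dst_idx + 0.5) - 0.5.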
std::vector<int> vy_n, vy_s;
std::vector<float> vd_n, vd_s;
  vy_n.resize(out_h);
  vy_s.resize(out_h);
  vd_n.resize(out_h);
  vd_s.resize(out_h);
for (int k = 0; k < out_h; k++) {
int yn = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
: static_cast<int>(ratio_h * k);
yn = (yn > 0) ? yn : 0;
int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1);
float idx_src_y = ratio_h * (k + 0.5) - 0.5;
idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn;
float ds = 1.f - dn;
{
vy_n[k] = yn;
vy_s[k] = ys;
vd_n[k] = dn;
vd_s[k] = ds;
}
}
std::vector<int> vx_w, vx_e;
std::vector<float> vd_w, vd_e;
  vx_w.resize(out_w);
  vx_e.resize(out_w);
  vd_w.resize(out_w);
  vd_e.resize(out_w);
for (int l = 0; l < out_w; l++) {
int xw = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
: static_cast<int>(ratio_w * l);
xw = (xw > 0) ? xw : 0;
int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1);
float idx_src_x = ratio_w * (l + 0.5) - 0.5;
idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw;
float de = 1.f - dw;
{
vx_w[l] = xw;
vx_e[l] = xe;
vd_w[l] = dw;
vd_e[l] = de;
}
}
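  // Row-major strides of the NCHW input, used to index the four neighbors.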
std::vector<int64_t> x_strides(x_dims.size(), 1);
for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
}
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < channel_size; j++) {
for (int k = 0; k < out_h; k++) {
for (int l = 0; l < out_w; l++) {
DType x0 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x1 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]];
DType x2 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]];
DType x3 = x_data[i * x_strides[0] + j * x_strides[1] +
vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]];
*out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] +
x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l];
out_data++;
}
}
}
}
}
class InterpComputeTester {
protected:
// common attributes for this op.
std::string x_var_name = "X";
std::string outsize_var_name = "OutSize";
std::string out_var_name = "Out";
std::string out_ref_var_name = "out_ref";
DDim dims_{{1, 2, 3, 4}};
Scope scope;
std::string interp_method_ = "nearest";
float scale_ = -1.f;
int out_h_ = -1;
int out_w_ = -1;
bool align_corners_ = true;
int align_mode_ = 1;
bool use_outsize_ = false;
public:
InterpComputeTester(const std::string& alias,
DDim dims,
std::string interp_method = "nearest",
float scale = -1.f,
int out_h = -1,
int out_w = -1,
bool align_corners = true,
int align_mode = 1,
bool use_outsize = false)
: dims_(dims),
interp_method_(interp_method),
scale_(scale),
out_h_(out_h),
out_w_(out_w),
align_corners_(align_corners),
align_mode_(align_mode),
use_outsize_(use_outsize) {}
void Execute(float abs_error) {
cpp::OpDesc op_desc;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* outsize = scope.Var(outsize_var_name)->GetMutable<Tensor>();
auto* outref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
int out_h = out_h_;
int out_w = out_w_;
if (scale_ > 0) {
out_h = static_cast<int>(dims_[2] * scale_);
out_w = static_cast<int>(dims_[3] * scale_);
}
x->Resize(dims_);
/* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h,
* out_w, dims_[1]); */
std::vector<int64_t> out_shape_nchw = {dims_[0], dims_[1], out_h, out_w};
outref->Resize(out_shape_nchw);
outsize->Resize({2});
FillTensor<float, float>(x, -1.f, 1.f);
if (use_outsize_) {
outsize->mutable_data<int>()[0] = out_h;
outsize->mutable_data<int>()[1] = out_w;
outsize->set_persistable(true);
}
if (interp_method_ == "nearest") {
op_desc.SetType("nearest_interp");
} else if (interp_method_ == "bilinear") {
op_desc.SetType("bilinear_interp");
} else {
LOG(FATAL) << "unsupport";
}
op_desc.SetInput("X", {x_var_name});
if (use_outsize_) {
op_desc.SetInput("OutSize", {outsize_var_name});
}
op_desc.SetOutput("Out", {out_var_name});
op_desc.SetAttr("scale", scale_);
op_desc.SetAttr("out_h", out_h_);
op_desc.SetAttr("out_w", out_w_);
op_desc.SetAttr("align_corners", align_corners_);
op_desc.SetAttr("align_mode", align_mode_);
op_desc.SetAttr("interp_method", interp_method_);
auto op = CreateOp<operators::InterpolateOp>(op_desc, &scope);
if (interp_method_ == "nearest") {
ResizeNearestAlign<float>(x, outref, align_corners_);
} else if (interp_method_ == "bilinear") {
BilinearInterpRef<float>(x, outref, align_corners_, align_mode_);
}
int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3];
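    // The MLU bridge consumes NHWC data, so transpose the NCHW input here and
    // transpose the output back to NCHW after LaunchOp before comparing.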
Tensor input_trans;
input_trans.Resize(dims_);
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{in, ic, ih, iw},
{0, 2, 3, 1});
x->CopyDataFrom(input_trans);
if (use_outsize_) {
LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name});
} else {
LaunchOp(op, {x_var_name}, {out_var_name});
}
auto* out_ref_data = outref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(out_shape_nchw);
transpose(
out->mutable_data<float>(),
output_trans.mutable_data<float>(),
{static_cast<int>(dims_[0]), out_h, out_w, static_cast<int>(dims_[1])},
{0, 3, 1, 2});
auto* out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); ++i) {
EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error);
}
}
};
void TestInterpOuthw(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (int out_h : {6, 8, 12}) {
for (int out_w : {6, 9}) {
printf("testcase %s: out_w %d, out_h %d\n",
interp_method.c_str(),
out_w,
out_h);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1.f, out_h, out_w);
tester.Execute(abs_error);
}
}
}
}
}
void TestInterpScale(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
for (float scale : {0.3f, 1.f, 1.7f}) {
printf("testcase %s: scale: %f\n", interp_method.c_str(), scale);
InterpComputeTester tester("def", DDim(x_dims), interp_method, scale);
tester.Execute(abs_error);
}
}
}
}
void TestInterpOutsize(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
/* for (auto interp_method : std::vector<std::string>{"nearest",
* "bilinear"}) { */
for (auto interp_method : std::vector<std::string>{"nearest"}) {
printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4);
InterpComputeTester tester(
"def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignCorners(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
printf(
"testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n",
align_corners);
InterpComputeTester tester(
"def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners);
tester.Execute(abs_error);
}
}
}
void TestInterpAlignMode(float abs_error = 2e-5) {
for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
for (bool align_corners : {true, false}) {
for (int align_mode : {0, 1}) {
printf(
"testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners "
"%d, mode %d\n",
align_corners,
align_mode);
InterpComputeTester tester("def",
DDim(x_dims),
"bilinear",
0.7,
-1,
-1,
align_corners,
align_mode);
tester.Execute(abs_error);
}
}
}
}
TEST(MLUBridges, interpolate) {
float abs_error = 2e-5;
TestInterpOuthw(abs_error);
TestInterpScale(abs_error);
// bug, not usable
// TestInterpOutsize(abs_error);
TestInterpAlignCorners(abs_error);
// only for bilinear interp
// TestInterpAlignMode(abs_error);
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU); ...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU);
USE_SUBGRAPH_BRIDGE(softmax, kMLU); USE_SUBGRAPH_BRIDGE(softmax, kMLU);
USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(batch_norm, kMLU);
USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU);
USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU);
USE_SUBGRAPH_BRIDGE(concat, kMLU);
USE_SUBGRAPH_BRIDGE(scale, kMLU);
...@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
// Get input, and attributes // Get input, and attributes
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_var_name); auto x = scope->FindTensor(x_var_name);
auto input_dims_nhwc = x->dims();
const auto input_dims = DimNHWC2NCHW(input_dims_nhwc);
auto output_var_name = op_info->Output("Out").front(); auto output_var_name = op_info->Output("Out").front();
auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type"); auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto ceil_mode = op_info->GetAttr<bool>("ceil_mode"); auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings"); auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
...@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
strides, strides,
ksize); ksize);
std::vector<int64_t> output_shape({input_dims[0], input_dims[1]}); // std::vector<int64_t> output_shape({input_dims[0], input_dims[1]});
for (size_t i = 0; i < 2; i++) { // for (size_t i = 0; i < 2; i++) {
output_shape.push_back( // output_shape.push_back(
(input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] -
strides[i] + // ksize[0]) /
1); // strides[i] +
} // 1);
// }
auto output_shape_nhwc = DimNCHW2NHWC(output_shape); auto output_tensor = graph->AddNode(
auto output_tensor = graph->AddNode(output_var_name, output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType());
output_shape_nhwc,
CNML_TENSOR,
CNML_NHWC,
graph->FPType());
scope->FindVar(output_var_name)
->GetMutable<::paddle::lite::Tensor>()
->Resize(output_shape_nhwc);
cnmlPoolOpParam_t pool_param; cnmlPoolOpParam_t pool_param;
CNML_CALL( CNML_CALL(
......
...@@ -24,8 +24,6 @@ namespace lite { ...@@ -24,8 +24,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int PoolConverter(void* ctx, OpLite* op);
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) { void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
const OpInfo* op_info = op->op_info(); const OpInfo* op_info = op->op_info();
...@@ -182,12 +180,7 @@ void test_pool(int bs, ...@@ -182,12 +180,7 @@ void test_pool(int bs,
{0, 2, 3, 1}); {0, 2, 3, 1});
auto os = out->dims(); auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -275,6 +268,4 @@ TEST(MLUBridges, pool) { ...@@ -275,6 +268,4 @@ TEST(MLUBridges, pool) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
pool2d,
paddle::lite::subgraph::mlu::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
// Create act node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
auto bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
auto scale = op_info->GetAttr<float>("scale");
auto bias = op_info->GetAttr<float>("bias");
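  // The scale op computes scale * x + bias when bias_after_scale is true and
  // scale * (x + bias) otherwise; fold the latter into beta so that a single
  // affine transform (alpha * x + beta) covers both cases.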
auto beta = bias_after_scale ? bias : bias * scale;
std::vector<int64_t> shape = {1, 1, 1, 1};
std::string prefix = string_format("_%p", op);
auto alpha_tensor = graph->AddNode(
"Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
auto beta_tensor = graph->AddNode(
"Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType());
graph->BindConstRawData("Alpha" + prefix, &scale, 1);
graph->BindConstRawData("Beta" + prefix, &beta, 1);
auto input_tensor = graph->GetNode(x_var_name);
cnmlBaseOp_t scale_op;
CNML_CALL(cnmlCreateScaleOp(&scale_op,
input_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
alpha_tensor->mlu_tensor(),
beta_tensor->mlu_tensor()));
graph->FuseOp(scale_op);
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(scale,
kMLU,
paddle::lite::subgraph::mlu::ScaleConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/scale_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
void scale_ref(const std::shared_ptr<operators::ScaleOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
float scale = op_info->GetAttr<float>("scale");
float bias = op_info->GetAttr<float>("bias");
bool bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
if (!bias_after_scale) {
bias *= scale;
}
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
DDim x_dims = x->dims();
DDim out_dims = out->dims();
CHECK_EQ(x_dims.production(), out_dims.production());
for (int i = 0; i < out_dims.production(); i++) {
out_data[i] = x_data[i] * scale + bias;
}
}
void test_scale(int bs,
int ic,
int ih,
int iw,
bool bias_after_scale,
float scale,
float bias) {
// prepare input&output variables
Scope scope;
std::string x_var_name("x");
std::string out_var_name("out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float, int>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("scale");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("bias_after_scale", bias_after_scale);
opdesc.SetAttr("scale", scale);
opdesc.SetAttr("bias", bias);
// create and convert op to MLU model, then run it on MLU
auto op = CreateOp<operators::ScaleOp>(opdesc, &scope);
scale_ref(op);
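  // scale_ref computed the reference result into 'out'; stash it in out_ref
  // before the MLU run overwrites 'out'.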
out_ref->CopyDataFrom(*out);
Tensor input_trans;
input_trans.Resize({bs, ic, ih, iw});
transpose(x->mutable_data<float>(),
input_trans.mutable_data<float>(),
{bs, ic, ih, iw},
{0, 2, 3, 1});
auto os = out->dims();
out->Resize({static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])});
x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name});
// execute reference implementation and save to output tensor('out')
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
Tensor output_trans;
output_trans.Resize(os);
transpose(out_data,
output_trans.mutable_data<float>(),
{static_cast<int>(os[0]),
static_cast<int>(os[2]),
static_cast<int>(os[3]),
static_cast<int>(os[1])},
{0, 3, 1, 2});
out_data = output_trans.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(MLUBridges, scale) {
for (auto bs : {1, 3}) {
for (auto ic : {1, 3}) {
for (auto ih : {3, 4}) {
for (auto iw : {4, 3}) {
for (auto bias_after_scale : {false, true}) {
for (auto scale : {-1.0f, 5.0f}) {
for (auto bias : {-2.0f, 30.0f}) {
VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
<< " iw: " << iw
// << " bias_after_scale: " << bias_after_scale
<< " scale: " << scale << " bias: " << bias;
test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias);
}
}
}
}
}
}
}
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(scale, kMLU);
...@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis = output_dims.size() + axis; axis = output_dims.size() + axis;
} }
} }
int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; int nhwc_axis = nchw_to_nhwc_aixs_map[axis];
auto output_tensor = graph->AddNode( auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t softmax_op; cnmlBaseOp_t softmax_op;
CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op, CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op,
nhwc_axis, nhwc_axis,
......
...@@ -23,8 +23,6 @@ namespace lite { ...@@ -23,8 +23,6 @@ namespace lite {
namespace subgraph { namespace subgraph {
namespace mlu { namespace mlu {
int SoftmaxConverter(void* ctx, OpLite* op);
template <typename dtype> template <typename dtype>
void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) { void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
Scope* scope = op->scope(); Scope* scope = op->scope();
...@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) { ...@@ -112,9 +110,7 @@ void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
{bs, ic, ih, iw}, {bs, ic, ih, iw},
{0, 2, 3, 1}); {0, 2, 3, 1});
out->Resize({bs, ih, iw, ic});
x->CopyDataFrom(input_trans); x->CopyDataFrom(input_trans);
x->Resize({bs, ih, iw, ic});
LaunchOp(op, {x_var_name}, {out_var_name}); LaunchOp(op, {x_var_name}, {out_var_name});
...@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) { ...@@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(MLU, USE_SUBGRAPH_BRIDGE(softmax, kMLU)
softmax,
paddle::lite::subgraph::mlu::SoftmaxConverter);
...@@ -47,6 +47,8 @@ class MLUTensor { ...@@ -47,6 +47,8 @@ class MLUTensor {
return mlu_ptr_; return mlu_ptr_;
} }
void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; }
~MLUTensor(); ~MLUTensor();
private: private:
......
...@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<std::string>& output_var_names) { const std::vector<std::string>& output_var_names) {
CNRT_CALL(cnrtInit(0)); CNRT_CALL(cnrtInit(0));
SetMluDevice(0); ::paddle::lite::SetMluDevice(0);
cnrtQueue_t queue_; cnrtQueue_t queue_;
cnrtInvokeFuncParam_t forward_param; cnrtInvokeFuncParam_t forward_param;
u32_t affinity = 1; u32_t affinity = 1;
...@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
const auto& bridges = subgraph::Registry::Instance(); const auto& bridges = subgraph::Registry::Instance();
CHECK(bridges.Exists(op_type, TARGET(kMLU))); CHECK(bridges.Exists(op_type, TARGET(kMLU)));
// Convert all of input data vars and added into the MLU IR graph // Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) { for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name); auto input_tensor = scope->FindMutableTensor(input_name);
CHECK(input_tensor); CHECK(input_tensor);
...@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
graph.AddNode(input_name, graph.AddNode(input_name,
input_tensor->dims().Vectorize(), input_tensor->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph.FPType(), graph.FPType(),
reinterpret_cast<void*>( reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU)))); input_tensor->mutable_data<float>(TARGET(kMLU))));
...@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
sizeof(float) * input_tensor->dims().production(), sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV)); CNRT_MEM_TRANS_DIR_HOST2DEV));
} }
op->CheckShape();
op->InferShape();
bridges.Select(op_type, TARGET(kMLU))( bridges.Select(op_type, TARGET(kMLU))(
reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr); reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr);
......
...@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> { ...@@ -84,7 +84,7 @@ struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
template <> template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> { struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef ::paddle::lite::fluid::float16 T; typedef paddle::lite::fluid::float16 T;
}; };
} // namespace mlu } // namespace mlu
......
...@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL( ...@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize(); .Finalize();
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
// host_to_device)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
// .Finalize();
//
//
// kMLU,
// kFloat,
// kNHWC,
// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
// device_to_host)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/layout_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFloat)>,
def_layout_nhwc2nchw_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute<PRECISION(kFP16)>,
def_layout_nhwc2nchw_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFloat,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFloat)>,
def_layout_nchw2nhwc_fp32)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kFP16,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kFP16)>,
def_layout_nchw2nhwc_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFP16),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(
layout,
kMLU,
kInt8,
kNHWC,
paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute<PRECISION(kInt8)>,
def_layout_nchw2nhwc_fp32_int8)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kInt8),
DATALAYOUT(kNHWC))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <string>
#include <vector>
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
#include "lite/operators/layout_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace mlu {
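// Map PrecisionType enum values to the C++ scalar types used by the x86
// transpose helpers below.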
template <paddle::lite_api::PrecisionType>
struct FPTypeTraits {};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFloat> {
typedef float T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kFP16> {
typedef paddle::lite::fluid::float16 T;
};
template <>
struct FPTypeTraits<paddle::lite_api::PrecisionType::kInt8> {
typedef int8_t T;
};
template <lite::TargetType Target, typename T>
inline void LayoutTransCompute(const int dim,
const lite::Context<Target>& context,
const lite::Tensor& in,
lite::Tensor* out,
const std::vector<int>& axis) {
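  // The x86 Transpose functor is templated on rank, so dispatch on the
  // dimensionality of the tensor.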
switch (dim) {
case 2:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 2> trans2;
trans2(context, in, out, axis);
break;
case 3:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 3> trans3;
trans3(context, in, out, axis);
break;
case 4:
paddle::lite::x86::math::Transpose<lite::TargetType::kX86, T, 4> trans4;
trans4(context, in, out, axis);
break;
default:
CHECK(0) << ("Unsupport dim in mlu layout");
}
}
template <PrecisionType Precision>
class LayoutNchwToNhwcCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
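    // Pick the permutation that moves the channel axis to the last position;
    // rank-2 tensors need no reordering.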
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
axis = {0, 2, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
break;
case 4:
axis = {0, 2, 3, 1};
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nchw to nhwc";
}
};
template <PrecisionType Precision>
class LayoutNhwcToNchwCompute
: public KernelLite<TARGET(kMLU), Precision, DATALAYOUT(kNHWC)> {
public:
using param_t = operators::LayoutParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto* x = param.x;
auto* out = param.y;
out->template mutable_data<typename FPTypeTraits<Precision>::T>();
auto x_dims = param.x->dims().size();
auto& context = this->ctx_->template As<X86Context>();
const auto origin_dims = out->dims().Vectorize();
std::vector<int> axis;
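    // Inverse permutation: move the trailing channel axis back to position 1;
    // rank-2 tensors need no reordering.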
switch (x_dims) {
case 2:
axis = {0, 1};
break;
case 3:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[2], out->dims()[1]});
axis = {0, 2, 1};
break;
case 4:
out->Resize(std::vector<int64_t>{
out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]});
axis = {0, 3, 1, 2};
break;
default:
CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw";
}
LayoutTransCompute<lite::TargetType::kX86,
typename FPTypeTraits<Precision>::T>(
x_dims, context, *x, out, axis);
if (x_dims > 2) {
out->Resize(origin_dims);
}
}
std::string doc() const override {
return "Mlu layout transform nhwc to nchw";
}
};
} // namespace mlu
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
graph_.SetFPType(type); graph_.SetFPType(type);
} }
int Build() {
// In order to attach all of the ops of the block desc, we need to build
// the original program firstly.
BuildOriginProgram();
// Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
build_device_program_status_ = BuildDeviceProgram();
return build_device_program_status_;
}
int Launch() {
// Rebuild device program when the shapes of input tensors have been
// changed.
if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
build_device_program_status_) &&
InputShapeChanged()) {
Build();
}
if (subgraph::CHECK_FAILED(build_device_program_status_)) {
LaunchOriginProgram();
} else {
LaunchDeviceProgram();
}
return 0;
}
protected: protected:
int BuildDeviceProgram() override { int BuildDeviceProgram() override {
int status = 0; int status = 0;
...@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddNode(input_name, graph_.AddNode(input_name,
input_tensor->dims().Vectorize(), input_tensor->dims().Vectorize(),
CNML_TENSOR, CNML_TENSOR,
CNML_NHWC, CNML_NCHW,
graph_.FPType(), graph_.FPType(),
const_cast<void*>(input_tensor->raw_data())); const_cast<void*>(input_tensor->raw_data()));
CHECK(input_node); CHECK(input_node);
...@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine {
for (auto& inst : origin_program_) { for (auto& inst : origin_program_) {
auto op = inst.op(); auto op = inst.op();
CHECK(op); CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type(); std::string op_type = op->op_info()->Type();
op->CheckShape();
const_cast<OpLite*>(op)->InferShape();
if (!bridges.Exists(op_type, TARGET(kMLU))) { if (!bridges.Exists(op_type, TARGET(kMLU))) {
LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type;
return subgraph::FAILED; return subgraph::FAILED;
...@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
graph_.AddInput(graph_.GetNode(input_name)); graph_.AddInput(graph_.GetNode(input_name));
} }
CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
// auto& mlu_context = this->ctx_->template As<MLUContext>(); auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto core_version = mlu_context.MLUCoreVersion(); auto core_version = mlu_context.MLUCoreVersion();
// auto core_number = mlu_context.MLUCoreNumber(); auto core_number = mlu_context.MLUCoreNumber();
// graph_.Compile(core_version, core_number); graph_.Compile(core_version, core_number);
return status; return status;
} }
int LaunchDeviceProgram() override { int LaunchDeviceProgram() override {
// auto& mlu_context = this->ctx_->template As<MLUContext>(); auto& mlu_context = this->ctx_->template As<MLUContext>();
// auto exec_queue = mlu_context.exec_queue(); auto exec_queue = mlu_context.exec_queue();
// u32_t affinity = mlu_context.affinity(); u32_t affinity = mlu_context.affinity();
// cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
// int data_param = 1; int data_param = 1;
// forward_param.data_parallelism = &data_param; forward_param.data_parallelism = &data_param;
// forward_param.affinity = &affinity; forward_param.affinity = &affinity;
// forward_param.end = CNRT_PARAM_END; forward_param.end = CNRT_PARAM_END;
// graph_.Compute(forward_param, exec_queue); graph_.Compute(forward_param, exec_queue);
return 0; return 0;
} }
......
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
return() return()
endif() endif()
......
...@@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne ...@@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
# extra # extra
# wait to add ... # wait to add ...
...@@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc ...@@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context) DEPS pad2d_opencl layout_opencl op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
###################### ######################
# buffer kernel # # buffer kernel #
###################### ######################
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::BoxCoderParam;
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
boxcoder_param_ = param_.get_mutable<param_t>();
if (boxcoder_param_->code_type == "decode_center_size" &&
boxcoder_param_->box_normalized == true) {
kernel_func_name_ = "decode_center_size";
} else {
printf("This code_type %s doesn't support \n",
boxcoder_param_->code_type.c_str());
return;
}
CHECK(context.cl_context() != nullptr);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/box_coder_kernel.cl", build_options_);
}
void Run() override {
boxcoder_param_ = param_.get_mutable<param_t>();
const auto& out_dims = boxcoder_param_->proposals->dims();
auto image_shape = InitImageDimInfoWith(out_dims);
auto* out_buf =
boxcoder_param_->proposals->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "boxcoder input shape: ";
#endif
const auto* input_priorbox = boxcoder_param_->prior_box;
const auto* input_priorboxvar = boxcoder_param_->prior_box_var;
const auto* input_targetbox = boxcoder_param_->target_box;
const auto& code_type = boxcoder_param_->code_type;
if (code_type == "decode_center_size") {
auto* prior_box_image = input_priorbox->data<half_t, cl::Image2D>();
auto* prior_box_var_image =
input_priorboxvar->data<half_t, cl::Image2D>();
auto* target_box_image = input_targetbox->data<half_t, cl::Image2D>();
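      // Pad the output dims up to 4-D so that C and H can be read from fixed
      // positions when setting the kernel arguments below.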
int new_dims[4] = {1, 1, 1, 1};
for (int i = 0; i < out_dims.size(); i++) {
new_dims[4 - out_dims.size() + i] = out_dims[i];
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(image_shape["width"]),
static_cast<int64_t>(image_shape["height"])}));
int out_C = new_dims[1];
int out_H = new_dims[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(boxcoder_param_->proposals->target());
VLOG(4) << "output shape: " << out_dims[0] << ", " << out_dims[1] << ", "
<< out_dims[2] << ", " << out_dims[3];
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "out_C = " << out_C;
VLOG(4) << "out_H = " << out_H;
VLOG(4) << "default_work_size = " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx++, *prior_box_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *prior_box_var_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *target_box_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_C);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_H);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1];
#endif
}
}
  std::string doc() const override {
    return "Boxcoder using cl::Image, kFP16";
  }
param_t* boxcoder_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image;
REGISTER_LITE_KERNEL(
box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault)
.BindInput("PriorBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("PriorBoxVar",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("TargetBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("OutputBox",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle {
namespace lite {
void box_coder_ref(float* proposals_data,
const float* anchors_data,
const float* bbox_deltas_data,
const float* variances_data,
int axis,
bool box_normalized,
std::string code_type,
int row,
int col) {
if (code_type == "decode_center_size") {
int anchor_len = 4;
int out_len = 4;
int var_len = 4;
int delta_len = 4;
float normalized = !box_normalized ? 1.f : 0;
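    // Non-normalized boxes are in pixel coordinates, so widths and heights
    // carry a +1 correction.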
for (int64_t row_id = 0; row_id < row; ++row_id) {
for (int64_t col_id = 0; col_id < col; ++col_id) {
size_t delta_offset = row_id * col * delta_len + col_id * delta_len;
size_t out_offset = row_id * col * out_len + col_id * out_len;
int prior_box_offset =
axis == 0 ? col_id * anchor_len : row_id * anchor_len;
int var_offset = axis == 0 ? col_id * var_len : row_id * var_len;
auto anchor_data_tmp = anchors_data + prior_box_offset;
auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset;
auto proposals_data_tmp = proposals_data + out_offset;
auto anchor_width =
anchor_data_tmp[2] - anchor_data_tmp[0] + normalized;
auto anchor_height =
anchor_data_tmp[3] - anchor_data_tmp[1] + normalized;
auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width;
auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height;
float bbox_center_x = 0, bbox_center_y = 0;
float bbox_width = 0, bbox_height = 0;
auto variances_data_tmp = variances_data + var_offset;
bbox_center_x =
variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width +
anchor_center_x;
bbox_center_y =
variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) *
anchor_width;
bbox_height =
std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) *
anchor_height;
proposals_data_tmp[0] = bbox_center_x - bbox_width / 2;
proposals_data_tmp[1] = bbox_center_y - bbox_height / 2;
proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized;
proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized;
}
}
} else if (code_type == "encode_center_size") {
LOG(FATAL) << "not implemented type: " << code_type;
} else {
LOG(FATAL) << "not supported type: " << code_type;
}
}
// #define BOXCODER_FP16_LOOP_TEST
// #define BOXCODER_FP16_PRINT_RESULT
TEST(box_coder_image2d, compute) {
#ifdef BOXCODER_FP16_LOOP_TEST
for (auto n : {1, 2, 3, 4}) {
for (auto m : {1, 3, 4, 8}) {
for (auto norm : {true}) {
for (auto code_type : {"decode_center_size"}) {
for (auto axis : {0}) {
#else
const int n = 1;
const int m = 1;
const bool norm = true;
const std::string code_type = "decode_center_size";
const int axis = 0;
#endif // BOXCODER_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << m
<< " ========";
LOG(INFO) << "======== parameters: norm = " << norm
<< ", axis = " << axis << "code_type: " << code_type;
auto kernels =
KernelRegistry::Global().Create("box_coder",
TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor prior_box, prior_box_var, target_box, output_box;
operators::BoxCoderParam param;
param.prior_box = &prior_box;
param.prior_box_var = &prior_box_var;
param.target_box = &target_box;
param.proposals = &output_box;
param.axis = axis;
param.box_normalized = norm;
param.code_type = code_type;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> boxcoder_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(boxcoder_context->As<OpenCLContext>()));
kernel->SetContext(std::move(boxcoder_context));
const DDim prior_box_dims =
DDim(std::vector<DDim::value_type>{1, 1, m, 4});
const DDim prior_box_var_dims =
DDim(std::vector<DDim::value_type>{1, 1, m, 4});
const DDim target_box_dims =
DDim(std::vector<DDim::value_type>{1, n, m, 4});
const DDim out_dim =
DDim(std::vector<DDim::value_type>{1, n, m, 4});
prior_box.Resize(prior_box_dims);
prior_box_var.Resize(prior_box_var_dims);
target_box.Resize(target_box_dims);
output_box.Resize(out_dim);
std::vector<float> prior_box_data(prior_box_dims.production());
std::vector<float> prior_box_var_data(
prior_box_var_dims.production());
std::vector<float> target_box_data(target_box_dims.production());
for (int i = 0; i < prior_box_dims.production(); i++) {
prior_box_data[i] = i * 1.1 / prior_box_dims.production();
}
for (int i = 0; i < prior_box_var_dims.production(); i++) {
prior_box_var_data[i] = i * 1.2 / prior_box_var_dims.production();
}
for (int i = 0; i < target_box_dims.production(); i++) {
target_box_data[i] = i * 1.3 / target_box_dims.production();
}
LOG(INFO) << "prepare input";
CLImageConverterDefault* default_converter =
new CLImageConverterDefault();
DDim prior_box_image_shape =
default_converter->InitImageDimInfoWith(prior_box_dims);
LOG(INFO) << "prior_box_image_shape = " << prior_box_image_shape[0]
<< " " << prior_box_image_shape[1];
std::vector<half_t> prior_box_image_data(
prior_box_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(prior_box_data.data(),
prior_box_image_data.data(),
prior_box_dims);
auto* prior_box_image = prior_box.mutable_data<half_t, cl::Image2D>(
prior_box_image_shape[0],
prior_box_image_shape[1],
prior_box_image_data.data());
DDim prior_box_var_image_shape =
default_converter->InitImageDimInfoWith(prior_box_var_dims);
LOG(INFO) << "prior_box_var_image_shape = "
<< prior_box_var_image_shape[0] << " "
<< prior_box_var_image_shape[1];
std::vector<half_t> prior_box_var_image_data(
prior_box_var_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(prior_box_var_data.data(),
prior_box_var_image_data.data(),
prior_box_var_dims);
auto* prior_box_var_image =
prior_box_var.mutable_data<half_t, cl::Image2D>(
prior_box_var_image_shape[0],
prior_box_var_image_shape[1],
prior_box_var_image_data.data());
DDim target_box_image_shape =
default_converter->InitImageDimInfoWith(target_box_dims);
LOG(INFO) << "target_box_image_shape = "
<< target_box_image_shape[0] << " "
<< target_box_image_shape[1];
std::vector<half_t> target_box_image_data(
target_box_image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(target_box_data.data(),
target_box_image_data.data(),
target_box_dims);
auto* target_box_image =
target_box.mutable_data<half_t, cl::Image2D>(
target_box_image_shape[0],
target_box_image_shape[1],
target_box_image_data.data());
DDim out_image_shape =
default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1];
auto* out_image = output_box.mutable_data<half_t, cl::Image2D>(
out_image_shape[0], out_image_shape[1]);
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.proposals->data<half_t, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the "
"target cl tensor.";
}
lite::Tensor out_ref_tensor;
out_ref_tensor.Resize(out_dim);
box_coder_ref(out_ref_tensor.mutable_data<float>(),
prior_box_data.data(),
target_box_data.data(),
prior_box_var_data.data(),
axis,
norm,
code_type,
target_box_dims[0],
target_box_dims[1]);
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
            half_t* out_image_data =
                new half_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_shape[0],
out_image_shape[1],
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
float* out_data = new float[out_image_shape.production() * 4];
default_converter->ImageToNCHW(
out_image_data, out_data, out_image_shape, out_dim);
// result
#ifdef BOXCODER_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << target_box_data[eidx] << " -> " << out_data[eidx]
<< std::endl;
}
#endif // BOXCODER_FP16_PRINT_RESULT
const float* out_ref = out_ref_tensor.data<float>();
for (int i = 0; i < out_dim.production(); i++) {
              auto abs_diff = std::abs(out_data[i] - out_ref[i]);
auto relative_diff =
COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
(abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << ", in_data[" << i
<< "]: " << target_box_data[i] << ", out_data[" << i
<< "]: " << out_data[i] << ", out_ref[" << i
<< "]: " << out_ref[i] << ", abs_diff: " << abs_diff
<< ", relative_diff: " << relative_diff
<< ", FP16_MAX_DIFF: " << FP16_MAX_DIFF;
}
}
#ifdef BOXCODER_FP16_LOOP_TEST
} // axis
} // code_type
} // norm
} // m
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(box_coder, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86) ...@@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86)
return() return()
endif() endif()
add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_function)
# lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
...@@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} ...@@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps}
add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute) add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute)
#add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps}) #add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_unpad_compute_x86 X86 basic SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding)
add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project)
# lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/fluid/eigen.h" #include "lite/fluid/eigen.h"
#include "lite/operators/activation_ops.h" #include "lite/operators/op_params.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// auto& context = ctx_->As<X86Context>(); // auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ActivationParam>(); auto& param = *param_.get_mutable<operators::ActivationParam>();
const T* x_data = param.X->data<T>(); const T* x_data = param.X->template data<T>();
T* out_data = param.Out->mutable_data<T>(); T* out_data = param.Out->template mutable_data<T>();
size_t x_size = param.X->numel(); size_t x_size = param.X->numel();
for (size_t i = 0; i < x_size; i++) { for (size_t i = 0; i < x_size; i++) {
out_data[i] = x_data[i] / (static_cast<T>(1) + std::abs(x_data[i])); out_data[i] = x_data[i] / (static_cast<T>(1) + std::abs(x_data[i]));
......
...@@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute ...@@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute
auto src_len = static_cast<int64_t>(bottom1->lod()[0][1]); auto src_len = static_cast<int64_t>(bottom1->lod()[0][1]);
const int att_batch = bottom0->lod()[0].size() - 1; const int att_batch = bottom0->lod()[0].size() - 1;
const int src_batch = bottom1->lod()[0].size() - 1; const int src_batch = bottom1->lod()[0].size() - 1;
int* pad_begin = _pad_begin->mutable_data<int>(); int* pad_begin = _pad_begin->template mutable_data<int>();
for (int i = 0; i < src_batch; ++i) { for (int i = 0; i < src_batch; ++i) {
const auto* src_data = bottom1->data<T>() + src_len * i; const auto* src_data = bottom1->template data<T>() + src_len * i;
int index = src_len - 1; int index = src_len - 1;
for (; index >= 0 && _pad_id == static_cast<int>(src_data[index]); for (; index >= 0 && _pad_id == static_cast<int>(src_data[index]);
--index) { --index) {
...@@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute ...@@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute
} }
const auto att_len = static_cast<int64_t>(bottom0->lod()[0][1]); const auto att_len = static_cast<int64_t>(bottom0->lod()[0][1]);
auto* top_data = top->mutable_data<T>(); auto* top_data = top->template mutable_data<T>();
memcpy(top_data, memcpy(top_data,
bottom0->data<T>(), bottom0->template data<T>(),
bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T));
for (int i = 0; i < att_batch; ++i) { for (int i = 0; i < att_batch; ++i) {
for (int j = 0; j < att_len; ++j) { for (int j = 0; j < att_len; ++j) {
top_data = top->mutable_data<T>() + src_len * (att_len * i + j); top_data =
top->template mutable_data<T>() + src_len * (att_len * i + j);
int src_idx = i % src_batch; int src_idx = i % src_batch;
for (int k = pad_begin[src_idx]; k < src_len; ++k) { for (int k = pad_begin[src_idx]; k < src_len; ++k) {
top_data[k] = _mask; top_data[k] = _mask;
......
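The first attention_padding_mask hunk above scans each source sequence from its last position backwards to find where the trailing pad tokens begin; a rough standalone sketch of that scan is below. The actual assignment of pad_begin sits outside the visible hunk, so treating the result as index + 1 is an assumption.
#include <vector>

// Rough sketch of the trailing-pad scan in attention_padding_mask: walk a
// source sequence from its last position backwards while it still holds the
// pad id. Returning index + 1 as "where padding starts" is an assumption.
int find_pad_begin(const std::vector<float>& src_data, int pad_id) {
  int index = static_cast<int>(src_data.size()) - 1;
  for (; index >= 0 && pad_id == static_cast<int>(src_data[index]); --index) {
  }
  return index + 1;  // first padded position (assumed convention)
}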
...@@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
const int sample_size = x->dims().production() / N / C; const int sample_size = x->dims().production() / N / C;
// alloc memory // alloc memory
param.y->mutable_data<T>(); param.y->template mutable_data<T>();
if (!param.is_test) { if (!param.is_test) {
param.mean_out->mutable_data<T>(); param.mean_out->template mutable_data<T>();
param.variance_out->mutable_data<T>(); param.variance_out->template mutable_data<T>();
param.saved_mean->mutable_data<T>(); param.saved_mean->template mutable_data<T>();
param.saved_variance->mutable_data<T>(); param.saved_variance->template mutable_data<T>();
} }
if (!global_stats) { if (!global_stats) {
      // saved_xx is used just in this batch of data      // saved_xx is used just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(), EigenVectorArrayMap<T> saved_mean_e(
C); param.saved_mean->template mutable_data<T>(), C);
EigenVectorArrayMap<T> saved_variance_e( EigenVectorArrayMap<T> saved_variance_e(
param.saved_variance->mutable_data<T>(), C); param.saved_variance->template mutable_data<T>(), C);
saved_mean_e.setZero(); saved_mean_e.setZero();
saved_variance_e.setZero(); saved_variance_e.setZero();
EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(), EigenVectorArrayMap<T> running_mean_arr(
C); param.mean_out->template mutable_data<T>(), C);
EigenVectorArrayMap<T> running_var_arr( EigenVectorArrayMap<T> running_var_arr(
param.variance_out->mutable_data<T>(), C); param.variance_out->template mutable_data<T>(), C);
if ((N * sample_size) == 1) { if ((N * sample_size) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, " LOG(WARNING) << "Only 1 element in normalization dimension, "
...@@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
switch (param.data_layout) { switch (param.data_layout) {
case DATALAYOUT(kNCHW): { case DATALAYOUT(kNCHW): {
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); ConstEigenArrayMap<T> x_arr(
x->template data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) { for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum(); saved_mean_e(nc % C) += x_arr.col(nc).sum();
} }
...@@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// use SavedMean and SavedVariance to do normalize // use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C); Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) { if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C); ConstEigenVectorArrayMap<T> var_arr(param.variance->template data<T>(),
C);
inv_std = (var_arr + param.epsilon).sqrt().inverse(); inv_std = (var_arr + param.epsilon).sqrt().inverse();
} else { } else {
EigenVectorArrayMap<T> saved_inv_std( EigenVectorArrayMap<T> saved_inv_std(
param.saved_variance->mutable_data<T>(), C); param.saved_variance->template mutable_data<T>(), C);
// inverse SavedVariance first, gradient will use it too. // inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
inv_std = saved_inv_std; inv_std = saved_inv_std;
} }
ConstEigenVectorArrayMap<T> mean_arr( ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C); global_stats ? param.mean->template data<T>()
: param.saved_mean->template data<T>(),
C);
// ((x - est_mean) * (inv_var) * scale + bias // ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====> // formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C); ConstEigenVectorArrayMap<T> scale_arr(param.scale->template data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C); ConstEigenVectorArrayMap<T> bias_arr(param.bias->template data<T>(), C);
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr; Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias = Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr; bias_arr - mean_arr * inv_std * scale_arr;
switch (param.data_layout) { switch (param.data_layout) {
case DATALAYOUT(kNCHW): { case DATALAYOUT(kNCHW): {
EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C); EigenArrayMap<T> y_arr(
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C); param.y->template mutable_data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x->template data<T>(), sample_size, N * C);
for (int nc = 0; nc < N * C; ++nc) { for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
} }
......
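As a sanity check on the algebra in the batch_norm comment above, the direct and folded forms of the normalization agree; this is a standalone sketch with arbitrary illustrative values, not part of the kernel.
#include <cassert>
#include <cmath>

// Verifies the rewrite used by the batch_norm kernel comment:
//   (x - est_mean) * inv_var * scale + bias
//     == x * (inv_var * scale) + (bias - est_mean * inv_var * scale)
int main() {
  float x = 2.5f, est_mean = 1.0f, var = 4.0f, epsilon = 1e-5f;
  float scale = 0.5f, bias = 0.1f;

  float inv_std = 1.0f / std::sqrt(var + epsilon);
  float y_direct = (x - est_mean) * inv_std * scale + bias;

  float new_scale = inv_std * scale;
  float new_bias = bias - est_mean * inv_std * scale;
  float y_folded = x * new_scale + new_bias;

  assert(std::fabs(y_direct - y_folded) < 1e-6f);
  return 0;
}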
...@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast, ...@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
cast,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
fp16_to_any)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
...@@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int64_t axis = static_cast<int64_t>(param.axis); int64_t axis = static_cast<int64_t>(param.axis);
auto* axis_tensor = param.axis_tensor; auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) { if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>(); auto* axis_tensor_data = axis_tensor->template data<int>();
axis = static_cast<int64_t>(axis_tensor_data[0]); axis = static_cast<int64_t>(axis_tensor_data[0]);
} }
...@@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int concat_input_size = count(axis + 1, x_dims.size(), x_dims); int concat_input_size = count(axis + 1, x_dims.size(), x_dims);
const int top_concat_axis = out->dims()[axis]; const int top_concat_axis = out->dims()[axis];
for (size_t i = 0; i < param.x.size(); ++i) { for (size_t i = 0; i < param.x.size(); ++i) {
const T* bottom_data = param.x[i]->data<T>(); const T* bottom_data = param.x[i]->template data<T>();
const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; const int64_t bottom_concat_axis = param.x[i]->dims()[axis];
for (int n = 0; n < num_concat; ++n) { for (int n = 0; n < num_concat; ++n) {
std::memcpy( std::memcpy(
......
...@@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::ConvParam>(); auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter; lite::Tensor filter = *param.filter;
param.output->mutable_data<T>(); param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]); const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize()); std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
...@@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto blas = auto blas =
paddle::lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); paddle::lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch = param.x->Slice<T>(i, i + 1); lite::Tensor in_batch = param.x->template Slice<T>(i, i + 1);
in_batch.Resize(input_shape); in_batch.Resize(input_shape);
lite::Tensor out_batch = param.output->Slice<T>(i, i + 1); lite::Tensor out_batch = param.output->template Slice<T>(i, i + 1);
out_batch.Resize(output_matrix_shape); out_batch.Resize(output_matrix_shape);
for (int g = 0; g < param.groups; g++) { for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice = lite::Tensor in_slice =
......
...@@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
using param_t = operators::DropoutParam; using param_t = operators::DropoutParam;
void Run() override { void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>(); auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>(); const auto* x_data = param.x->template data<T>();
auto* out_data = param.output->mutable_data<T>(); auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) { if (!param.is_test) {
auto* mask_data = param.mask->mutable_data<T>(); auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd; std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd(); int seed = param.fix_seed ? param.seed : rnd();
......
...@@ -248,8 +248,8 @@ class TransformFunctor { ...@@ -248,8 +248,8 @@ class TransformFunctor {
lite::Tensor *z, lite::Tensor *z,
const lite::Context<Target> &ctx, const lite::Context<Target> &ctx,
Functor func) Functor func)
: x_(x->data<T>()), : x_(x->template data<T>()),
y_(y->data<T>()), y_(y->template data<T>()),
z_(z->mutable_data<OutType>()), z_(z->mutable_data<OutType>()),
nx_(x->numel()), nx_(x->numel()),
ctx_(ctx), ctx_(ctx),
...@@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context<Target> &ctx, ...@@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context<Target> &ctx,
x.data<T>(), x.data<T>(),
y.data<T>(), y.data<T>(),
compound_functor, compound_functor,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()}); ? nullptr
: intermediate_out->template mutable_data<T>()});
} }
template <lite::TargetType Target, template <lite::TargetType Target,
...@@ -523,9 +524,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx, ...@@ -523,9 +524,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx,
compound_functor, compound_functor,
h, h,
w, w,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()); ? nullptr
: intermediate_out->template mutable_data<T>());
} else { } else {
FusedElemwiseAndActBroadcast2CPU<T, FusedElemwiseAndActBroadcast2CPU<T,
...@@ -539,9 +541,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx, ...@@ -539,9 +541,10 @@ void FusedElemwiseAndActComputeWithBroadcast(const lite::Context<Target> &ctx,
n, n,
post, post,
compound_functor, compound_functor,
out->mutable_data<T>(), out->template mutable_data<T>(),
intermediate_out == nullptr ? nullptr intermediate_out == nullptr
: intermediate_out->mutable_data<T>()); ? nullptr
: intermediate_out->template mutable_data<T>());
} }
} }
......
...@@ -140,9 +140,9 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -140,9 +140,9 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int M = output->dims().production() / w_dims1; int M = output->dims().production() / w_dims1;
const T* input_data = input->data<T>(); const T* input_data = input->template data<T>();
const T* w_data = w->data<T>(); const T* w_data = w->template data<T>();
T* output_data = output->mutable_data<T>(); T* output_data = output->template mutable_data<T>();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
FCFunctor<lite::TargetType::kX86, T> fc; FCFunctor<lite::TargetType::kX86, T> fc;
...@@ -153,7 +153,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -153,7 +153,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
input_data, input_data,
w_data, w_data,
output_data, output_data,
bias ? bias->data<T>() : NULL, bias ? bias->template data<T>() : NULL,
with_relu, with_relu,
padding_weights); padding_weights);
} }
......
...@@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute ...@@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute
int output_dim_idx = param.output_dim_idx; int output_dim_idx = param.output_dim_idx;
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1; odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->Resize(odims); out->Resize(odims);
// out->mutable_data<T>(); // out->template mutable_data<T>();
} }
out->mutable_data<T>(); out->template mutable_data<T>();
auto value = param.value; auto value = param.value;
paddle::lite::x86::math::SetConstant<lite::TargetType::kX86, T> setter; paddle::lite::x86::math::SetConstant<lite::TargetType::kX86, T> setter;
......
...@@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src, ...@@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src,
auto src_dims = src->dims(); auto src_dims = src->dims();
const T* p_src = src->data<T>(); const T* p_src = src->template data<T>();
const IndexT* p_index = index->data<IndexT>(); const IndexT* p_index = index->data<IndexT>();
T* p_output = output->mutable_data<T>(); T* p_output = output->template mutable_data<T>();
// slice size // slice size
int slice_size = 1; int slice_size = 1;
...@@ -77,7 +77,7 @@ class GatherCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -77,7 +77,7 @@ class GatherCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto index = param.Index; auto index = param.Index;
auto out = param.Out; auto out = param.Out;
out->mutable_data<T>(); out->template mutable_data<T>();
if (x->dims().production() == 0) return; if (x->dims().production() == 0) return;
/* /*
* Since there's no type defined for lite::Tensor in Paddle-Lite, then * Since there's no type defined for lite::Tensor in Paddle-Lite, then
......
...@@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context, ...@@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context,
bool indexed_src) { bool indexed_src) {
lite::x86::math::CopyMatrixRowsFunctor<TARGET(kX86), T> row_shuffle; lite::x86::math::CopyMatrixRowsFunctor<TARGET(kX86), T> row_shuffle;
dst->Resize(src.dims()); dst->Resize(src.dims());
dst->mutable_data<T>(); dst->template mutable_data<T>();
row_shuffle(context, src, index_lod, dst, indexed_src); row_shuffle(context, src, index_lod, dst, indexed_src);
} }
...@@ -65,18 +65,19 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -65,18 +65,19 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* input = param.input; auto* input = param.input;
auto* h0 = param.h0; auto* h0 = param.h0;
auto* weight = param.weight; auto* weight = param.weight;
const T* weight_data = weight->data<T>(); const T* weight_data = weight->template data<T>();
auto* bias = param.bias; auto* bias = param.bias;
auto* batch_gate = param.batch_gate; auto* batch_gate = param.batch_gate;
auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev;
auto* batch_hidden = param.batch_hidden; auto* batch_hidden = param.batch_hidden;
T* batch_gate_ptr = batch_gate->mutable_data<T>(); T* batch_gate_ptr = batch_gate->template mutable_data<T>();
T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data<T>(); T* batch_reset_hidden_prev_ptr =
T* batch_hidden_ptr = batch_hidden->mutable_data<T>(); batch_reset_hidden_prev->template mutable_data<T>();
T* batch_hidden_ptr = batch_hidden->template mutable_data<T>();
auto* hidden = param.hidden; auto* hidden = param.hidden;
hidden->mutable_data<T>(); hidden->template mutable_data<T>();
const auto& hidden_dims = hidden->dims(); const auto& hidden_dims = hidden->dims();
...@@ -99,7 +100,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -99,7 +100,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// Since the batch computing for GRU reorders the input sequences // Since the batch computing for GRU reorders the input sequences
      // according to their length, the initialized cell state also needs      // according to their length, the initialized cell state also needs
      // to be reordered.      // to be reordered.
const std::vector<size_t>& order(batch_gate->lod()[2]); const std::vector<uint64_t>& order(batch_gate->lod()[2]);
ReorderInitState<T>(context, *h0, order, &ordered_h0, true); ReorderInitState<T>(context, *h0, order, &ordered_h0, true);
gru_value.prev_out_value = ordered_h0.mutable_data<T>(); gru_value.prev_out_value = ordered_h0.mutable_data<T>();
} else { } else {
......
...@@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto x_dims = x->dims(); auto x_dims = x->dims();
y->mutable_data<T>(); y->template mutable_data<T>();
Mean->mutable_data<T>(); Mean->template mutable_data<T>();
Var->mutable_data<T>(); Var->template mutable_data<T>();
auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); auto matrix_dim = x_dims.Flatten2D(begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
...@@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
.At(right); .At(right);
ker(in.mutable_data<T>(), ker(in.mutable_data<T>(),
out.mutable_data<T>(), out.mutable_data<T>(),
Mean->mutable_data<T>(), Mean->template mutable_data<T>(),
Var->mutable_data<T>(), Var->template mutable_data<T>(),
Scale->data<T>(), Scale->template data<T>(),
Bias->data<T>(), Bias->template data<T>(),
static_cast<int>(left), static_cast<int>(left),
epsilon, epsilon,
right); right);
......
...@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *ids_t = param.Ids; auto *ids_t = param.Ids;
auto *output_t = param.Out; auto *output_t = param.Out;
int64_t padding_idx = param.padding_idx; int64_t padding_idx = param.padding_idx;
const int64_t *ids = ids_t->data<int64_t>(); const int64_t *ids = ids_t->template data<int64_t>();
int64_t ids_numel = ids_t->dims().production(); int64_t ids_numel = ids_t->dims().production();
auto *table_t = param.W; auto *table_t = param.W;
int64_t row_number = table_t->dims()[0]; int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1]; int64_t row_width = table_t->dims()[1];
const T *table = table_t->data<T>(); const T *table = table_t->template data<T>();
T *output = output_t->mutable_data<T>(); T *output = output_t->template mutable_data<T>();
memset(output, 0, output_t->dims().production() * sizeof(T)); memset(output, 0, output_t->dims().production() * sizeof(T));
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != -1 && ids[i] == padding_idx) { if (padding_idx != -1 && ids[i] == padding_idx) {
......
...@@ -35,7 +35,7 @@ void MatchMatrixTensorCompute<T>::Run() { ...@@ -35,7 +35,7 @@ void MatchMatrixTensorCompute<T>::Run() {
const auto& offset_l = x->lod()[0]; const auto& offset_l = x->lod()[0];
const auto& offset_r = y->lod()[0]; const auto& offset_r = y->lod()[0];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
...@@ -97,9 +97,9 @@ void MatchMatrixTensorCompute<T>::Run() { ...@@ -97,9 +97,9 @@ void MatchMatrixTensorCompute<T>::Run() {
int batch_size = x->lod()[0].size() - 1; int batch_size = x->lod()[0].size() - 1;
int lod_lv1_size = batch_size * dim_t; int lod_lv1_size = batch_size * dim_t;
int lod_lv2_size = x->lod()[0].back() * dim_t; int lod_lv2_size = x->lod()[0].back() * dim_t;
std::vector<size_t> out_lod0(batch_size + 1, 0); std::vector<uint64_t> out_lod0(batch_size + 1, 0);
std::vector<size_t> out_lod1(lod_lv1_size + 1, 0); std::vector<uint64_t> out_lod1(lod_lv1_size + 1, 0);
std::vector<size_t> out_lod2(lod_lv2_size + 1, 0); std::vector<uint64_t> out_lod2(lod_lv2_size + 1, 0);
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
out_lod0[i + 1] = out_lod0[i] + dim_t; out_lod0[i + 1] = out_lod0[i] + dim_t;
int len_l = offset_l[i + 1] - offset_l[i]; int len_l = offset_l[i + 1] - offset_l[i];
......
...@@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto *x = param.X; auto *x = param.X;
auto *y = param.Y; auto *y = param.Y;
auto *out = param.Out; auto *out = param.Out;
out->mutable_data<T>(); out->template mutable_data<T>();
auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor( auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor(
......
...@@ -64,7 +64,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -64,7 +64,7 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
y_matrix = *y; y_matrix = *y;
} }
z->mutable_data<T>(); z->template mutable_data<T>();
auto z_dim = z->dims(); auto z_dim = z->dims();
if (z_dim.size() != 2) { if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
......
...@@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
bool reduce_all = param.reduce_all; bool reduce_all = param.reduce_all;
auto* input = param.x; auto* input = param.x;
auto* output = param.output; auto* output = param.output;
param.output->mutable_data<T>(); param.output->template mutable_data<T>();
const auto& dims = param.dim; const auto& dims = param.dim;
bool keep_dim = param.keep_dim; bool keep_dim = param.keep_dim;
......
...@@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
scale_compute(param.x->data<T>(), scale_compute(param.x->template data<T>(),
param.output->mutable_data<T>(), param.output->template mutable_data<T>(),
param.x->dims().production(), param.x->dims().production(),
param.scale, param.scale,
param.bias, param.bias,
......
...@@ -84,7 +84,7 @@ void SearchGrnnCompute<T>::PrepareLayout(const Tensor* input_blob) { ...@@ -84,7 +84,7 @@ void SearchGrnnCompute<T>::PrepareLayout(const Tensor* input_blob) {
int max_width = width_data[idx_sorted_by_width_data[0]]; int max_width = width_data[idx_sorted_by_width_data[0]];
// start of reorganizing the input // start of reorganizing the input
std::vector<size_t> new_offset; std::vector<uint64_t> new_offset;
new_offset.resize(max_width + 1); new_offset.resize(max_width + 1);
new_offset[0] = 0; new_offset[0] = 0;
......
...@@ -50,7 +50,7 @@ class SearchGroupPaddingCompute ...@@ -50,7 +50,7 @@ class SearchGroupPaddingCompute
} }
} }
std::vector<size_t> new_offset; std::vector<uint64_t> new_offset;
new_offset.resize(batch + 1); new_offset.resize(batch + 1);
for (int i = 0; i < batch + 1; ++i) { for (int i = 0; i < batch + 1; ++i) {
new_offset[i] = i * max_seq; new_offset[i] = i * max_seq;
...@@ -67,7 +67,7 @@ class SearchGroupPaddingCompute ...@@ -67,7 +67,7 @@ class SearchGroupPaddingCompute
top1_lod.push_back(offset); top1_lod.push_back(offset);
top1->set_lod(top1_lod); top1->set_lod(top1_lod);
top1->Resize({dim0, 1}); top1->Resize({dim0, 1});
memset(top1->mutable_data<T>(), memset(top1->template mutable_data<T>(),
0, 0,
top1->dims()[0] * top1->dims()[1] * sizeof(T)); top1->dims()[0] * top1->dims()[1] * sizeof(T));
// for padding input id // for padding input id
...@@ -76,9 +76,9 @@ class SearchGroupPaddingCompute ...@@ -76,9 +76,9 @@ class SearchGroupPaddingCompute
top2->set_lod(top2_lod); top2->set_lod(top2_lod);
top2->Resize({batch * max_seq, 1}); top2->Resize({batch * max_seq, 1});
// copy data // copy data
const auto* bottom_data = bottom0->data<T>(); const auto* bottom_data = bottom0->template data<T>();
auto* top_data = top0->mutable_data<T>(); auto* top_data = top0->template mutable_data<T>();
auto* top_padding_input_data = top2->mutable_data<T>(); auto* top_padding_input_data = top2->template mutable_data<T>();
for (int i = 0; i < batch; i++) { for (int i = 0; i < batch; i++) {
const int copy_step = offset[i + 1] - offset[i]; const int copy_step = offset[i + 1] - offset[i];
const int start = i * max_seq; const int start = i * max_seq;
......
...@@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
int M = x_dims[0]; int M = x_dims[0];
int N = w_dims[0]; int N = w_dims[0];
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
blas.AXPY( blas.AXPY(N,
N, static_cast<T>(1), b->data<T>(), out->mutable_data<T>() + i * N); static_cast<T>(1),
b->template data<T>(),
out->template mutable_data<T>() + i * N);
} }
} }
} }
......
...@@ -39,9 +39,9 @@ class SequenceArithmeticCompute ...@@ -39,9 +39,9 @@ class SequenceArithmeticCompute
out->Resize(x->dims()); out->Resize(x->dims());
out->set_lod(x->lod()); out->set_lod(x->lod());
auto x_data = x->data<T>(); auto x_data = x->template data<T>();
auto y_data = y->data<T>(); auto y_data = y->template data<T>();
auto out_data = out->mutable_data<T>(); auto out_data = out->template mutable_data<T>();
auto x_seq_offset = x->lod()[0]; auto x_seq_offset = x->lod()[0];
auto y_seq_offset = y->lod()[0]; auto y_seq_offset = y->lod()[0];
int seq_num = x_seq_offset.size() - 1; int seq_num = x_seq_offset.size() - 1;
......
...@@ -25,7 +25,7 @@ namespace x86 { ...@@ -25,7 +25,7 @@ namespace x86 {
template <typename T> template <typename T>
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs, inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs,
std::vector<lite::Tensor>* xs_in_order) { std::vector<lite::Tensor>* xs_in_order) {
std::vector<size_t> result; std::vector<uint64_t> result;
result.resize(xs[0]->lod()[0].size()); result.resize(xs[0]->lod()[0].size());
for (size_t i = 1; i < result.size(); ++i) { for (size_t i = 1; i < result.size(); ++i) {
...@@ -75,7 +75,7 @@ class SequenceConcatCompute ...@@ -75,7 +75,7 @@ class SequenceConcatCompute
out_dims[0] = batch_size; out_dims[0] = batch_size;
param.Out->Resize(out_dims); param.Out->Resize(out_dims);
T* dout = param.Out->mutable_data<T>(); T* dout = param.Out->template mutable_data<T>();
std::vector<lite::Tensor> x_in_order; std::vector<lite::Tensor> x_in_order;
param.Out->set_lod(ConcatLoD<T>(param.X, &x_in_order)); param.Out->set_lod(ConcatLoD<T>(param.X, &x_in_order));
......
...@@ -26,7 +26,7 @@ namespace x86 { ...@@ -26,7 +26,7 @@ namespace x86 {
namespace { namespace {
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs, inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs,
std::vector<lite::Tensor>* xs_in_order) { std::vector<lite::Tensor>* xs_in_order) {
std::vector<size_t> result; std::vector<uint64_t> result;
result.resize(xs[0]->lod()[0].size()); result.resize(xs[0]->lod()[0].size());
for (size_t i = 1; i < result.size(); ++i) { for (size_t i = 1; i < result.size(); ++i) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_conv_compute.h"
REGISTER_LITE_KERNEL(sequence_conv,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceConvCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/math/context_project.h"
#include "lite/backends/x86/math/math_function.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceConvCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceConvParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
auto* in = param.X;
auto* filter = param.Filter;
auto* out = param.Out;
out->template mutable_data<T>();
CHECK(in->lod().size() == 1) << "Only support one level sequence now";
int context_start = param.contextStart;
int context_stride = param.contextStride;
int context_length = param.contextLength;
bool padding_trainable = false;
const Tensor* padding_data = nullptr;
int up_pad = std::max(0, -context_start);
int down_pad = std::max(0, context_start + context_length - 1);
auto sequence_width = static_cast<int64_t>(in->dims()[1]);
std::vector<int64_t> col_shape{in->dims()[0],
context_length * sequence_width};
Tensor col;
col.Resize(col_shape);
col.mutable_data<T>();
    // If padding_trainable is false, the padding data should be zeros.
math::SetConstant<TARGET(kX86), T> set_zero;
auto blas = math::GetBlas<TARGET(kX86), T>(ctx);
set_zero(ctx, &col, static_cast<T>(0));
math::ContextProjectFunctor<TARGET(kX86), T> seq_project_functor;
seq_project_functor(ctx,
*in,
padding_data,
padding_trainable,
context_start,
context_length,
context_stride,
up_pad,
down_pad,
&col);
blas.MatMul(col, *filter, out);
}
virtual ~SequenceConvCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
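A standalone sketch of the padding arithmetic the new sequence_conv kernel uses: up_pad and down_pad are derived from contextStart and contextLength, and the im2col buffer holds context_length * sequence_width columns per input row. The concrete attribute values below are hypothetical and only for illustration.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical attribute values, for illustration only.
  int context_start = -1;
  int context_length = 3;
  int64_t rows = 8;             // in->dims()[0]
  int64_t sequence_width = 16;  // in->dims()[1]

  // Same arithmetic as SequenceConvCompute::Run() above.
  int up_pad = std::max(0, -context_start);                        // 1
  int down_pad = std::max(0, context_start + context_length - 1);  // 1

  std::cout << "up_pad=" << up_pad << " down_pad=" << down_pad
            << " col shape: [" << rows << ", "
            << context_length * sequence_width << "]\n";
  return 0;
}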
...@@ -29,8 +29,9 @@ using Tensor = lite::Tensor; ...@@ -29,8 +29,9 @@ using Tensor = lite::Tensor;
template <typename T> template <typename T>
struct SequenceExpandFunctor { struct SequenceExpandFunctor {
void operator()(const Tensor &x, void operator()(
const std::vector<size_t> &ref_lod, /*expand referenced lod*/ const Tensor &x,
const std::vector<uint64_t> &ref_lod, /*expand referenced lod*/
Tensor *out) { Tensor *out) {
int64_t hight = x.dims()[0]; int64_t hight = x.dims()[0];
int64_t width = x.data_size() / hight; int64_t width = x.data_size() / hight;
...@@ -39,13 +40,13 @@ struct SequenceExpandFunctor { ...@@ -39,13 +40,13 @@ struct SequenceExpandFunctor {
T *out_data = out->mutable_data<T, T>(); T *out_data = out->mutable_data<T, T>();
for (int h_id = 0; h_id < hight; ++h_id) { for (int h_id = 0; h_id < hight; ++h_id) {
size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; uint64_t span = ref_lod[h_id + 1] - ref_lod[h_id];
if (span == 0) continue; if (span == 0) continue;
const T *src = in_data + h_id * width; const T *src = in_data + h_id * width;
for (int64_t w_id = 0; w_id < width; ++w_id) { for (uint64_t w_id = 0; w_id < width; ++w_id) {
T ele = src[w_id]; T ele = src[w_id];
size_t offset = ref_lod[h_id] * width; size_t offset = ref_lod[h_id] * width;
for (size_t k = 0; k < span; ++k) { for (uint64_t k = 0; k < span; ++k) {
out_data[offset + k * width + w_id] = ele; out_data[offset + k * width + w_id] = ele;
} }
} }
...@@ -68,7 +69,7 @@ class SequenceExpandAsCompute ...@@ -68,7 +69,7 @@ class SequenceExpandAsCompute
CHECK_EQ(y_lod.size(), 1); CHECK_EQ(y_lod.size(), 1);
CHECK_GT(y_lod[0].size(), 1); CHECK_GT(y_lod[0].size(), 1);
out->mutable_data<T, T>(); out->template mutable_data<T, T>();
SequenceExpandFunctor<T> seq_espand_functor; SequenceExpandFunctor<T> seq_espand_functor;
seq_espand_functor(*x, y_lod[0], out); seq_espand_functor(*x, y_lod[0], out);
......
...@@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
dims[0] = lod[0].size() - 1; dims[0] = lod[0].size() - 1;
out->Resize({dims}); out->Resize({dims});
out->mutable_data<T>(); out->template mutable_data<T>();
lite::Tensor* index = nullptr; lite::Tensor* index = nullptr;
const bool is_test = true; const bool is_test = true;
......
...@@ -64,9 +64,9 @@ class SequenceReshapeCompute ...@@ -64,9 +64,9 @@ class SequenceReshapeCompute
out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()), out->Resize(std::vector<int64_t>{static_cast<int64_t>(out->lod()[0].back()),
out_width}); out_width});
auto* dst_ptr = out->mutable_data<T>(); auto* dst_ptr = out->template mutable_data<T>();
auto size = in->numel() * sizeof(T); auto size = in->numel() * sizeof(T);
std::memcpy(dst_ptr, in->data<T>(), size); std::memcpy(dst_ptr, in->template data<T>(), size);
} }
virtual ~SequenceReshapeCompute() = default; virtual ~SequenceReshapeCompute() = default;
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/x86/sequence_unpad_compute.h"
REGISTER_LITE_KERNEL(sequence_unpad,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::SequenceUnpadCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindInput("Length",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/backends/x86/math/sequence_padding.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
namespace math = paddle::lite::x86::math;
template <typename T>
class SequenceUnpadCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::SequenceUnpadParam;
void Run() override {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<X86Context>();
param.Out->template mutable_data<T>();
int64_t padded_length = param.X->dims()[1];
math::UnpaddingLoDTensorFunctor<lite::TargetType::kX86, T>()(
ctx,
*param.X,
param.Out,
padded_length,
0,
false,
math::kBatchLengthWidth);
}
virtual ~SequenceUnpadCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
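Conceptually, the unpad step above keeps, for each sequence, only its first length[i] rows of the padded [batch, padded_length, width] input and concatenates them. A standalone sketch of that idea follows; it is not the actual UnpaddingLoDTensorFunctor, just the shape logic under that assumption.
#include <cstdint>
#include <vector>

// Sketch of the unpadding idea used by sequence_unpad: the input is a padded
// [batch, padded_length, width] buffer plus per-sequence lengths, and the
// output concatenates only the valid rows of each sequence.
std::vector<float> unpad(const std::vector<float>& padded,
                         const std::vector<int64_t>& length,
                         int64_t padded_length,
                         int64_t width) {
  std::vector<float> out;
  for (size_t i = 0; i < length.size(); ++i) {
    const float* seq = padded.data() + i * padded_length * width;
    out.insert(out.end(), seq, seq + length[i] * width);
  }
  return out;
}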
...@@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
void Run() override { void Run() override {
auto& param = *param_.get_mutable<operators::ShapeParam>(); auto& param = *param_.get_mutable<operators::ShapeParam>();
// auto& context = context_->As<X86Context>(); // auto& context = context_->As<X86Context>();
auto out_data = param.Out->mutable_data<int32_t>(); auto out_data = param.Out->template mutable_data<int32_t>();
auto in_dims = param.X->dims(); auto in_dims = param.X->dims();
for (int i = 0; i < in_dims.size(); ++i) { for (int i = 0; i < in_dims.size(); ++i) {
out_data[i] = in_dims[i]; out_data[i] = in_dims[i];
......
...@@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto* x = param.x; auto* x = param.x;
auto* output = param.output; auto* output = param.output;
output->mutable_data<T>(); output->template mutable_data<T>();
const int rank = x->dims().size(); const int rank = x->dims().size();
const int axis = CanonicalAxis(param.axis, rank); const int axis = CanonicalAxis(param.axis, rank);
......
...@@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto x = param.X; auto x = param.X;
auto output = param.Out; auto output = param.Out;
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* x_data = x->data<T>(); auto* x_data = x->template data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
memcpy(out_data, x_data, x_dims.production() * sizeof(T)); memcpy(out_data, x_data, x_dims.production() * sizeof(T));
} }
...@@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto output = param.Out; auto output = param.Out;
auto xshape = param.XShape; auto xshape = param.XShape;
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* x_data = x->data<T>(); auto* x_data = x->template data<T>();
auto* out_data = output->mutable_data<T>(); auto* out_data = output->template mutable_data<T>();
auto* xshape_data = xshape->mutable_data<T>(); auto* xshape_data = xshape->template mutable_data<T>();
memcpy(out_data, x_data, x_dims.production() * sizeof(T)); memcpy(out_data, x_data, x_dims.production() * sizeof(T));
memcpy(xshape_data, x_data, x_dims.production() * sizeof(T)); memcpy(xshape_data, x_data, x_dims.production() * sizeof(T));
} }
......
...@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -40,9 +40,9 @@ class StackCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
if (axis < 0) axis += (x[0]->dims().size() + 1); if (axis < 0) axis += (x[0]->dims().size() + 1);
int n = static_cast<int>(x.size()); int n = static_cast<int>(x.size());
auto y_data = y->mutable_data<T>(); auto y_data = y->template mutable_data<T>();
std::vector<const T*> x_datas(n); std::vector<const T*> x_datas(n);
for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data<T>(); for (int i = 0; i < n; ++i) x_datas[i] = x[i]->template data<T>();
int pre = 1, post = 1; int pre = 1, post = 1;
auto dim = x[0]->dims(); auto dim = x[0]->dims();
......
...@@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto* x = param.x; auto* x = param.x;
auto* out = param.output; auto* out = param.output;
out->mutable_data<T>(); out->template mutable_data<T>();
int ndims = param.axis.size(); int ndims = param.axis.size();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
TransCompute<lite::TargetType::kX86, T>( TransCompute<lite::TargetType::kX86, T>(
...@@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto* x = param.x; auto* x = param.x;
auto* out = param.output; auto* out = param.output;
out->mutable_data<T>(); out->template mutable_data<T>();
int ndims = param.axis.size(); int ndims = param.axis.size();
auto& context = ctx_->As<X86Context>(); auto& context = ctx_->As<X86Context>();
TransCompute<lite::TargetType::kX86, T>( TransCompute<lite::TargetType::kX86, T>(
......
...@@ -34,8 +34,8 @@ class UniformRandomCompute ...@@ -34,8 +34,8 @@ class UniformRandomCompute
auto *param_out = &param.Out->raw_tensor(); auto *param_out = &param.Out->raw_tensor();
T *data = T *data = param_out->template mutable_data<T>(
param_out->mutable_data<T>(context.x86_device_context()->GetPlace()); context.x86_device_context()->GetPlace());
unsigned int seed = static_cast<unsigned int>(param.seed); unsigned int seed = static_cast<unsigned int>(param.seed);
std::minstd_rand engine; std::minstd_rand engine;
......
...@@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
std::vector<int64_t> col_dims_vec{top_size}; std::vector<int64_t> col_dims_vec{top_size};
col_dims_vec.push_back(1); col_dims_vec.push_back(1);
col->Resize(col_dims_vec); col->Resize(col_dims_vec);
auto* top_data = col->mutable_data<T>(); auto* top_data = col->template mutable_data<T>();
const auto* bottom_data = input.data<T>(); const auto* bottom_data = input.data<T>();
int kernel_win_size = kernel_h * kernel_w; int kernel_win_size = kernel_h * kernel_w;
...@@ -149,7 +149,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -149,7 +149,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// const auto& offset_y = in_row->lod()[0]; // const auto& offset_y = in_row->lod()[0];
const auto& offset_y = param.X->lod()[1]; const auto& offset_y = param.X->lod()[1];
const auto& offset_x = param.X->lod()[2]; const auto& offset_x = param.X->lod()[2];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
...@@ -178,9 +178,9 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -178,9 +178,9 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
std::vector<int64_t> top_dims_vec{top_size}; std::vector<int64_t> top_dims_vec{top_size};
top_dims_vec.push_back(1); top_dims_vec.push_back(1);
top->Resize(top_dims_vec); top->Resize(top_dims_vec);
auto* top_data = top->mutable_data<T>(); auto* top_data = top->template mutable_data<T>();
const auto* w_data = w->data<T>(); const auto* w_data = w->template data<T>();
const auto* col_data = col->data<T>(); const auto* col_data = col->template data<T>();
auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context); auto blas = lite::x86::math::GetBlas<lite::TargetType::kX86, T>(context);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
......
...@@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, ...@@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom,
const auto& col_offset = col->lod()[0]; const auto& col_offset = col->lod()[0];
const auto& offset_x = in_col->lod()[0]; const auto& offset_x = in_col->lod()[0];
const auto& offset_y = in_row->lod()[0]; const auto& offset_y = in_row->lod()[0];
std::vector<size_t> top_offset; std::vector<uint64_t> top_offset;
int top_size = 0; int top_size = 0;
top_offset.push_back(top_size); top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) { for (int b = 0; b < batch; ++b) {
......
if(NOT LITE_WITH_XPU)
return()
endif()
add_subdirectory(bridges) if(LITE_WITH_XTCL)
add_subdirectory(bridges)
add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
else()
add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu)
add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(activation_compute_xpu XPU basic SRCS activation_compute.cc DEPS ${lite_kernel_deps})
add_kernel(pool_compute_xpu XPU basic SRCS pool_compute.cc DEPS ${lite_kernel_deps})
add_kernel(elementwise_compute_xpu XPU basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps})
add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps})
add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps})
add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps})
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__multi_encoder_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUMultiEncoderCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
for (auto* fc_weight : param.fc_weight) {
arg_fc_weight_.push_back(
reinterpret_cast<const int16_t*>(fc_weight->data<float>()));
}
for (auto* fc_bias : param.fc_bias) {
arg_fc_bias_.push_back(fc_bias->data<float>());
}
for (auto* ln_scale : param.ln_scale) {
arg_ln_scale_.push_back(ln_scale->data<float>());
}
for (auto* ln_bias : param.ln_bias) {
arg_ln_bias_.push_back(ln_bias->data<float>());
}
if (param.act_type == "relu") {
act_type_ = xdnn::Activation_t::RELU;
}
}
void XPUMultiEncoderCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int seq_len = param.input->dims()[1];
int r = xdnn::bert_encoder_transformer_int16<int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* batch_size */
seq_len, /* from_seq_len */
seq_len, /* to_seq_len */
param.head_num, /* head_num */
param.size_per_head, /* size_per_head */
param.n_layers, /* n_layers */
param.input->data<float>(), /* from_tensor */
param.input->data<float>(), /* to_tensor */
param.mask->data<float>(), /* att_mask */
&arg_fc_weight_[0], /* fc_weights */
&arg_fc_bias_[0], /* fc_biass */
&arg_ln_scale_[0], /* ln_scales */
&arg_ln_bias_[0], /* ln_biass */
param.output->mutable_data<float>(TARGET(kXPU)), /* output */
param.fc_weight_max->data<float>(), /* fc_weights_max */
true, /* pretrans_b */
true, /* use_l3 */
act_type_ /* act_type */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__multi_encoder,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUMultiEncoderCompute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("FCWeightMax", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUMultiEncoderCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUMultiEncoderParam;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_fc_weight_;
std::vector<const float *> arg_fc_bias_;
std::vector<const float *> arg_ln_scale_;
std::vector<const float *> arg_ln_bias_;
xdnn::Activation_t act_type_{xdnn::Activation_t::GELU};
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/__xpu__resnet50_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void XPUResNet50Compute::PrepareForRun() {
auto& param = this->Param<param_t>();
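// The filter tensors carry int16 data inside float buffers (presumably
// pre-quantized by the __xpu__resnet50 fusion pass), hence the
// reinterpret_cast below.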
for (auto* filter : param.filter) {
arg_filter_.push_back(
reinterpret_cast<const int16_t*>(filter->data<float>()));
}
for (auto* bias : param.bias) {
arg_bias_.push_back(bias->data<float>());
}
for (auto* max_filter : param.max_filter) {
arg_max_filter_.push_back(max_filter->data<float>());
}
}
void XPUResNet50Compute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int batch_size = param.input->dims()[0];
int r = xdnn::conv2d_int16_resnet<float, int16_t>(
ctx.GetRawContext(), /* context */
batch_size, /* num */
param.input->data<float>(), /* bottom */
&arg_filter_[0], /* weight_list */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
&arg_bias_[0], /* bias_list */
&arg_max_filter_[0] /* max_filter_list */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(__xpu__resnet50,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::XPUResNet50Compute,
def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class XPUResNet50Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::XPUResNet50Param;
virtual void PrepareForRun();
virtual void Run();
private:
std::vector<const int16_t *> arg_filter_;
std::vector<const float *> arg_max_filter_;
std::vector<const float *> arg_bias_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/activation_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ReluCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::RELU, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::TANH, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
void SigmoidCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int r = xdnn::activation_forward(
ctx.GetRawContext(), /* context */
xdnn::Activation_t::SIGMOID, /* type */
param.X->numel(), /* len */
param.X->data<float>(), /* x */
param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(
tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(sigmoid,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SigmoidCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ReluCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~ReluCompute() = default;
};
class TanhCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~TanhCompute() = default;
};
class SigmoidCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
virtual void Run();
virtual ~SigmoidCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/batch_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
float epsilon = param.epsilon;
auto& x_dims = param.x->dims();
int r = xdnn::batch_norm_infer_forward(
ctx.GetRawContext(), /* context */
epsilon, /* epsilon */
x_dims[0], /* img_n */
x_dims[1], /* img_c */
x_dims[2], /* img_h */
x_dims[3], /* img_w */
param.x->data<float>(), /* img_gm */
param.y->mutable_data<float>(TARGET(kXPU)), /* out_gm */
param.scale->data<float>(), /* scale_gm */
param.bias->data<float>(), /* bias_gm */
param.mean->data<float>(), /* mean_gm */
param.variance->data<float>() /* var__gm */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BatchNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class BatchNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
virtual void Run();
virtual ~BatchNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_XPU) if(NOT LITE_WITH_XTCL)
return() return()
endif() endif()
......
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/cast_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
void CastCompute<InType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto* x = param.X;
auto* out = param.Out;
int out_dtype = param.out_dtype;
auto* in_data = x->template data<InType>();
int numel = x->numel();
int r = 0;
// Paddle VarType codes for out_dtype: BOOL = 0; INT16 = 1; INT32 = 2;
// INT64 = 3; FP16 = 4; FP32 = 5; FP64 = 6; SIZE_T = 19; UINT8 = 20; INT8 = 21.
if (out_dtype == 5) {
auto* out_data = out->template mutable_data<float>(TARGET(kXPU));
r = xdnn::cast<InType, float>(
ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 2) {
auto* out_data = out->template mutable_data<int>(TARGET(kXPU));
r = xdnn::cast<InType, int>(ctx.GetRawContext(), in_data, out_data, numel);
} else if (out_dtype == 3) {
auto* out_data = out->template mutable_data<int64_t>(TARGET(kXPU));
r = xdnn::cast<InType, int64_t>(
ctx.GetRawContext(), in_data, out_data, numel);
} else {
CHECK(false);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(cast,
kXPU,
kAny,
kNCHW,
paddle::lite::kernels::xpu::CastCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <typename InType>
class CastCompute : public KernelLite<TARGET(kXPU), PRECISION(kAny)> {
public:
using param_t = operators::CastParam;
void Run() override;
virtual ~CastCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/conv_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <>
void Conv2dCompute<PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
auto& w_dims = param.filter->dims();
int groups = param.groups;
auto& strides = param.strides;
auto paddings = *param.paddings;
auto dilations = *param.dilations;
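// Plain convolution: bias, residual branch and activation are not fused here
// (bias/branch are nullptr and the activation type is LINEAR).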
int r = xdnn::conv2d_forward_int16<float, float, float, float>(
ctx.GetRawContext(), /* context */
x_dims[0], /* num */
x_dims[1], /* input_c */
x_dims[2], /* input_h */
x_dims[3], /* input_w */
w_dims[0], /* num_filter */
w_dims[2], /* kernel_h */
w_dims[3], /* kernel_w */
strides[0], /* stride_h */
strides[1], /* stride_w */
paddings[0], /* pad_h */
paddings[1], /* pad_w */
dilations[0], /* dilation_h */
dilations[1], /* dilation_w */
groups, /* group */
param.x->data<float>(), /* bottom */
param.filter->data<float>(), /* weight */
param.output->mutable_data<float>(TARGET(kXPU)), /* top */
nullptr, /* bias */
nullptr, /* branch */
xdnn::Activation_t::LINEAR, /* type */
nullptr, /* max_image_ptr */
nullptr, /* max_filter_ptr */
nullptr /* max_result_ptr */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace xpu = paddle::lite::kernels::xpu;
using Conv2dFp32 = xpu::Conv2dCompute<PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
template <PrecisionType FilterPtype>
class Conv2dCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
public:
using param_t = operators::ConvParam;
virtual void Run();
virtual ~Conv2dCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/dropout_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void DropoutCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
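// This kernel treats dropout as an identity at inference: the input is copied
// to the output on the device and no (1 - dropout_prob) scaling is applied.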
int size = param.x->numel() * sizeof(float);
int r = xdnn::memcpy_device(
ctx.GetRawContext(), /* context */
param.output->mutable_data<float>(TARGET(kXPU)), /* dst */
param.x->data<float>(), /* src */
size /* size */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(dropout,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::DropoutCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class DropoutCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
virtual void Run();
virtual ~DropoutCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/elementwise_compute.h"
#include <functional>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ElementwiseAddCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
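// X is treated as `iter` contiguous slices of length Y->numel() (Y is expected
// to match the trailing dimensions of X from `axis` on); Y is added to each
// slice in turn.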
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
void ElementwiseSubCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.X->dims().data();
auto& y_dims = param.Y->dims();
int axis = param.axis;
if (param.axis == -1) {
axis = x_dims.size() - y_dims.size();
}
int iter = std::accumulate(
x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies<int>());
int stride = param.Y->numel();
for (int i = 0; i < iter; ++i) {
const float* x_ptr = param.X->data<float>() + i * stride;
const float* y_ptr = param.Y->data<float>();
float* o_ptr = param.Out->mutable_data<float>(TARGET(kXPU)) + i * stride;
int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */
x_ptr, /* x */
y_ptr, /* y */
o_ptr, /* z */
stride /* len */);
CHECK_EQ(r, 0);
}
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(elementwise_add,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseAddCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ElementwiseAddCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseAddCompute() = default;
};
class ElementwiseSubCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
virtual void Run();
virtual ~ElementwiseSubCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
/*
* This kernel copies a tensor from host to XPU.
*/
class IoCopyHostToXPUCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kX86) ||
param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size();
VLOG(4) << "host to xpu, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kXPU), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::HtoD);
}
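// The inferred output type keeps the input's id, precision and layout and only
// switches the target from host to XPU.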
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
*res = [](const std::map<std::string, const Type*>& inputs,
const std::string& out) -> const Type* {
CHECK(!inputs.empty());
auto* type = inputs.at("Input");
CHECK(type->target() == TARGET(kHost));
auto out_place = type->place();
out_place.target = TARGET(kXPU);
auto* out_type = Type::Get(type->id(),
out_place.target,
out_place.precision,
out_place.layout,
out_place.device);
return out_type;
};
return res;
}
std::string doc() const override { return "Copy IO from HOST to XPU"; }
};
/*
* This kernel copies a tensor from XPU to host.
*/
class IoCopyXPUToHostCompute
: public KernelLite<TARGET(kXPU), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kXPU));
auto mem_size = param.x->memory_size();
VLOG(4) << "xpu to host, copy size " << mem_size;
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
TargetWrapperXPU::MemcpySync(
data, param.x->raw_data(), mem_size, IoDirection::DtoH);
}
std::string doc() const override { return "Copy IO from XPU to HOST"; }
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyHostToXPUCompute,
host_to_device)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
REGISTER_LITE_KERNEL(io_copy_once,
kXPU,
kAny,
kAny,
paddle::lite::kernels::xpu::IoCopyXPUToHostCompute,
device_to_host)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kAny),
DATALAYOUT(kAny))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/layer_norm_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LayerNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
auto axis = param.begin_norm_axis;
auto matrix_dim = x_dims.Flatten2D(axis);
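// The input is viewed as an (m, n) matrix split at begin_norm_axis; each row
// of length n is normalized independently.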
float epsilon = param.epsilon;
int r = xdnn::layer_norm(ctx.GetRawContext(), /* context */
matrix_dim[0], /* m */
matrix_dim[1], /* n */
param.X->data<float>(), /* in */
param.Y->mutable_data<float>(TARGET(kXPU)), /* out */
param.Scale->data<float>(), /* scale */
param.Bias->data<float>(), /* bias */
epsilon /* epsilon */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(layer_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LayerNormCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LayerNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LayerNormParam;
virtual void Run();
virtual ~LayerNormCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/lookup_table_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void LookupTableCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int num = param.Ids->numel();
int embed_dim = param.W->dims()[1];
int r = xdnn::embedding<float, int64_t>(
ctx.GetRawContext(), /* context */
num, /* num */
param.Ids->data<int64_t>(), /* indices */
embed_dim, /* embed_dim */
param.W->data<float>(), /* table */
param.Out->mutable_data<float>(TARGET(kXPU)) /* top */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(lookup_table,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class LookupTableCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::LookupTableParam;
virtual void Run();
virtual ~LookupTableCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/matmul_compute.h"
#include "lite/backends/xpu/math.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace math = paddle::lite::xpu::math;
void MatMulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto* x = param.X;
auto* y = param.Y;
auto* out = param.Out;
auto mat_dim_a = math::CreateMatrixDescriptor(
math::RowMatrixFromVector(x->dims()), 0, param.transpose_X);
auto mat_dim_b = math::CreateMatrixDescriptor(
math::ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y);
int lda = (mat_dim_a.trans_ ? mat_dim_a.height_ : mat_dim_a.width_);
int ldb = (mat_dim_b.trans_ ? mat_dim_b.height_ : mat_dim_b.width_);
int ldc = mat_dim_b.width_;
int r = 0;
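// A batch size of 0 or 1 degenerates to a single GEMM via fc_int16; otherwise
// a strided batched GEMM is issued, one matrix product per batch element.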
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
r = xdnn::fc_int16(ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.height_, /* m */
mat_dim_b.width_, /* n */
mat_dim_a.width_, /* k */
param.alpha, /* alpha */
x->data<float>(), /* A */
y->data<float>(), /* B */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)) /* C */);
} else {
// batch matmul
r = xdnn::gemm_strided_batched_int16<float, float, float>(
ctx.GetRawContext(), /* context */
mat_dim_a.trans_, /* TransA */
mat_dim_b.trans_, /* TransB */
mat_dim_a.batch_size_, /* batch_size */
mat_dim_a.height_, /* M */
mat_dim_b.width_, /* N */
mat_dim_a.width_, /* K */
param.alpha, /* alpha */
x->data<float>(), /* A */
lda, /* lda */
mat_dim_a.stride_, /* stride_a */
y->data<float>(), /* B */
ldb, /* ldb */
mat_dim_b.stride_, /* stride_b */
0.0f, /* beta */
out->mutable_data<float>(TARGET(kXPU)), /* C */
ldc, /* ldc */
mat_dim_a.height_ * mat_dim_b.width_ /* stride_c */);
}
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
matmul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MatMulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class MatMulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MatMulParam;
virtual void Run();
virtual ~MatMulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/mul_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void MulCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& origin_x = *param.x;
auto& origin_y = *param.y;
auto& x_dims = origin_x.dims();
auto& y_dims = origin_y.dims();
Tensor x_matrix, y_matrix;
if (x_dims.size() > 2) {
x_matrix = ReshapeToMatrix(origin_x, param.x_num_col_dims);
} else {
x_matrix = origin_x;
}
if (y_dims.size() > 2) {
y_matrix = ReshapeToMatrix(origin_y, param.y_num_col_dims);
} else {
y_matrix = origin_y;
}
int m = x_matrix.dims()[0];
int k = x_matrix.dims()[1];
int n = y_matrix.dims()[1];
int r =
xdnn::fc_int16(ctx.GetRawContext(), /* context */
false, /* TransA */
false, /* TransB */
m,
n,
k,
1.0f, /* alpha */
x_matrix.data<float>(), /* A */
y_matrix.data<float>(), /* B */
0.0f, /* beta */
param.output->mutable_data<float>(TARGET(kXPU)) /* C */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
mul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
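// Folds the first `num_col_dims` dimensions of a rank > 2 tensor into rows and
// the remaining dimensions into columns; the returned 2-D tensor shares its
// buffer with `src`.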
static inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src,
int num_col_dims) {
int rank = src.dims().size();
if (rank == 2) {
return src;
}
lite::Tensor res;
res.ShareDataWith(src);
res.Resize(src.dims().Flatten2D(num_col_dims));
return res;
}
class MulCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
virtual void Run();
virtual ~MulCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/pool_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void Pool2DCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
CHECK_EQ(x_dims.size(), 4);
auto& o_dims = param.output->dims();
CHECK_EQ(param.ksize.size(), 2);
if (param.global_pooling) {
param.ksize[0] = x_dims[2];
param.ksize[1] = x_dims[3];
}
CHECK_EQ(param.strides.size(), 2);
CHECK_EQ(param.paddings->size(), 4);
auto& paddings = *param.paddings;
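// Max pooling by default; for average pooling pick AVG_WITHOUT_PAD when all
// paddings are zero and AVG_WITH_PAD otherwise.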
auto type = xdnn::MAX_WITHOUT_INDEX;
if (param.pooling_type == "avg") {
if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 &&
paddings[3] == 0) {
type = xdnn::AVG_WITHOUT_PAD;
} else {
type = xdnn::AVG_WITH_PAD;
}
}
int r = xdnn::pooling_forward<float, float>(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
nullptr, /* y_index */
type, /* type */
x_dims[0] * x_dims[1], /* c */
x_dims[2], /* in_h */
x_dims[3], /* in_w */
paddings[0], /* pad_left */
paddings[1], /* pad_right */
paddings[2], /* pad_up */
paddings[3], /* pad_down */
param.ksize[0], /* win_h */
param.ksize[1], /* win_w */
param.strides[0], /* stride_h */
param.strides[1], /* stride_w */
o_dims[2], /* out_h */
o_dims[3] /* out_w */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class Pool2DCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
virtual void Run();
virtual ~Pool2DCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/scale_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void ScaleCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int r = xdnn::scale(ctx.GetRawContext(), /* context */
x_dims.production(), /* len */
param.scale, /* alpha */
param.bias, /* beta */
param.bias_after_scale, /* bias_after_scale */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)) /* y */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
scale, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ScaleCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class ScaleCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::ScaleParam;
virtual void Run();
virtual ~ScaleCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/slice_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SliceCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.X->dims();
x_shape_.resize(x_dims.size());
x_dim_begin_.resize(x_dims.size());
x_dim_end_.resize(x_dims.size());
}
void SliceCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.X->dims();
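// Start from the full extent of every dimension, then narrow the begin/end of
// the dimensions listed in `axes` with the corresponding `starts`/`ends`.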
for (size_t i = 0; i < x_dims.size(); ++i) {
x_shape_[i] = x_dims[i];
x_dim_begin_[i] = 0;
x_dim_end_[i] = x_dims[i];
}
for (size_t i = 0; i < param.axes.size(); ++i) {
int axis = param.axes[i];
x_dim_begin_[axis] = param.starts[i];
x_dim_end_[axis] = param.ends[i];
}
int ndim = param.X->dims().size();
int r = xdnn::slice_forward(
ctx.GetRawContext(), /* context */
&x_shape_[0], /* shape */
&x_dim_begin_[0], /* starts */
&x_dim_end_[0], /* ends */
ndim, /* n */
param.X->data<float>(), /* in */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
slice, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SliceCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
class SliceCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SliceParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~SliceCompute() = default;
private:
std::vector<int> x_shape_;
std::vector<int> x_dim_begin_;
std::vector<int> x_dim_end_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/softmax_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void SoftmaxCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto& x_dims = param.x->dims();
int axis = CanonicalAxis(param.axis, x_dims.size());
int rows = SizeToAxis(axis, x_dims);
int cols = SizeFromAxis(axis, x_dims);
int r = xdnn::softmax2d_forward(
ctx.GetRawContext(), /* context */
param.x->data<float>(), /* x */
param.output->mutable_data<float>(TARGET(kXPU)), /* y */
rows, /* rows */
cols /* cols */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(softmax,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::SoftmaxCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
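// Helpers that view the input as a 2-D matrix: dimensions before `axis` are
// folded into rows and the remaining ones into columns, so the kernel can call
// softmax2d_forward row-wise.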
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, lite::DDim dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
}
return size;
}
static inline int SizeFromAxis(const int axis, lite::DDim dims) {
int size = 1;
for (size_t i = axis; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
class SoftmaxCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::SoftmaxParam;
virtual void Run();
virtual ~SoftmaxCompute() = default;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/stack_compute.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
void StackCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
int n = param.X.size();
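// xdnn::stack_forward consumes a device-side array of input pointers, so room
// for n device pointers (8 bytes each) is allocated once here.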
void* x_ptr = nullptr;
xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */);
x_ptr_guard_.reset(x_ptr);
x_ptr_cpu_.resize(n);
}
void StackCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
int n = param.X.size();
auto x_dims = param.X[0]->dims();
int axis = param.axis;
// A negative axis is relative to the output rank, which is the input rank + 1
// because stack inserts a new dimension.
if (axis < 0) axis += (x_dims.size() + 1);
auto matrix = x_dims.Flatten2D(axis);
int height = matrix[0];
int width = matrix[1];
for (int i = 0; i < n; ++i) {
x_ptr_cpu_[i] = param.X[i]->data<float>();
}
xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE);
int r = xdnn::stack_forward(
ctx.GetRawContext(), /* context */
height, /* height */
width, /* width */
n, /* n */
x_ptr_guard_.get(), /* x_ptr */
param.Out->mutable_data<float>(TARGET(kXPU)) /* out */);
CHECK_EQ(r, 0);
}
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
};
class StackCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::StackParam;
virtual void PrepareForRun();
virtual void Run();
virtual ~StackCompute() = default;
private:
std::unique_ptr<void, XPUFreeDeleter> x_ptr_guard_;
std::vector<const float*> x_ptr_cpu_;
};
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <xtcl/xtcl.h>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
......
...@@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) ...@@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} )
add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS})
add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS})
add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS})
add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(activation_basic_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS})
add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS})
add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
...@@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) ...@@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS})
add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS})
# 3.extra ops # 3.extra ops
add_operator(activation_extra_ops extra SRCS activation_extra_ops.cc DEPS ${op_DEPS})
add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS})
add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS})
add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
...@@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS}) ...@@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS})
add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS})
add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS})
add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS})
add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS})
add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS})
add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS})
...@@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $ ...@@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $
add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS})
add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS})
add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS})
add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS})
# for OCR specific # for OCR specific
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
...@@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE ...@@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE
add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS}) add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
# Only for XPU
add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS})
add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86) if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc lite_cc_test(test_fc_op SRCS fc_op_test.cc
DEPS fc_op memory DEPS fc_op memory
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__multi_encoder_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUMultiEncoderOp::CheckShape() const { return true; }
bool XPUMultiEncoderOp::InferShapeImpl() const {
auto input_shape = param_.input->dims();
param_.output->Resize(input_shape);
return true;
}
bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc,
lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.mask = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Mask").front())->Get<lite::Tensor>());
param_.fc_weight_max = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("FCWeightMax").front())
->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
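// FCWeight/FCBias/LNScale/LNBias are per-layer tensor lists; collect them in the
// order given by the op desc.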
param_.fc_weight.clear();
for (auto& name : op_desc.Input("FCWeight")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_weight.push_back(t);
}
param_.fc_bias.clear();
for (auto& name : op_desc.Input("FCBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.fc_bias.push_back(t);
}
param_.ln_scale.clear();
for (auto& name : op_desc.Input("LNScale")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_scale.push_back(t);
}
param_.ln_bias.clear();
for (auto& name : op_desc.Input("LNBias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.ln_bias.push_back(t);
}
param_.n_layers = op_desc.GetAttr<int>("n_layers");
param_.head_num = op_desc.GetAttr<int>("head_num");
param_.size_per_head = op_desc.GetAttr<int>("size_per_head");
param_.act_type = op_desc.GetAttr<std::string>("act_type");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__multi_encoder,
paddle::lite::operators::XPUMultiEncoderOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUMultiEncoderOp : public OpLite {
public:
XPUMultiEncoderOp() {}
explicit XPUMultiEncoderOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "MultiEncoder"; }
private:
mutable XPUMultiEncoderParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/__xpu__resnet50_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool XPUResNet50Op::CheckShape() const { return true; }
bool XPUResNet50Op::InferShapeImpl() const {
auto input_shape = param_.input->dims();
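// The fused op's output shape is fixed to [N, 2048, 1, 1], presumably because the
// fused ResNet50 graph ends with a global pooling over the spatial dimensions.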
input_shape[1] = 2048;
input_shape[2] = 1;
input_shape[3] = 1;
param_.output->Resize(input_shape);
return true;
}
bool XPUResNet50Op::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.input = const_cast<lite::Tensor*>(
&scope->FindVar(op_desc.Input("Input").front())->Get<lite::Tensor>());
param_.output = scope->FindVar(op_desc.Output("Output").front())
->GetMutable<lite::Tensor>();
param_.filter.clear();
for (auto& name : op_desc.Input("Filter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.filter.push_back(t);
}
param_.bias.clear();
for (auto& name : op_desc.Input("Bias")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.bias.push_back(t);
}
param_.max_filter.clear();
for (auto& name : op_desc.Input("MaxFilter")) {
auto t =
const_cast<lite::Tensor*>(&scope->FindVar(name)->Get<lite::Tensor>());
param_.max_filter.push_back(t);
}
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(__xpu__resnet50, paddle::lite::operators::XPUResNet50Op);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class XPUResNet50Op : public OpLite {
public:
XPUResNet50Op() {}
explicit XPUResNet50Op(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "ResNet50"; }
private:
mutable XPUResNet50Param param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/operators/activation_ops.h"
// Extra activation ops
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(reciprocal, paddle::lite::operators::ActivationOp);
...@@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} else if (opdesc.Type() == "abs") { } else if (opdesc.Type() == "abs") {
// abs // abs
param_.active_type = lite_api::ActivationType::kAbs; param_.active_type = lite_api::ActivationType::kAbs;
} else if (opdesc.Type() == "hard_swish") {
// hard_swish
param_.active_type = lite_api::ActivationType::kHardSwish;
param_.hard_swish_threshold = opdesc.GetAttr<float>("threshold");
param_.hard_swish_scale = opdesc.GetAttr<float>("scale");
param_.hard_swish_offset = opdesc.GetAttr<float>("offset");
} else if (opdesc.Type() == "reciprocal") {
param_.active_type = lite_api::ActivationType::kReciprocal;
} }
VLOG(4) << "opdesc.Type():" << opdesc.Type(); VLOG(4) << "opdesc.Type():" << opdesc.Type();
...@@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); // Basic activation ops
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/ctc_align_op.h"
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool CtcAlignOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.input != nullptr);
CHECK_OR_FALSE(param_.output != nullptr);
auto* input = param_.input;
auto* input_length = param_.input_length;
auto input_lod = input->lod();
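// Sequence information must come either from the input's LoD or from the optional
// InputLength tensor.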
CHECK_OR_FALSE(!input_lod.empty() || input_length != nullptr);
return true;
}
bool CtcAlignOpLite::InferShapeImpl() const {
auto input_dims = param_.input->dims();
// The exact output shape is data-dependent, so use the input dims as a placeholder here.
param_.output->Resize(input_dims);
if (param_.input_length != nullptr && param_.output_length != nullptr) {
param_.output_length->Resize({input_dims[0], 1});
}
return true;
}
bool CtcAlignOpLite::AttachImpl(const cpp::OpDesc& op_desc,
lite::Scope* scope) {
AttachInput(op_desc, scope, "Input", false, &param_.input);
AttachInput(op_desc, scope, "InputLength", true, &param_.input_length);
AttachOutput(op_desc, scope, "Output", false, &param_.output);
AttachOutput(op_desc, scope, "OutputLength", true, &param_.output_length);
param_.blank = op_desc.GetAttr<int>("blank");
param_.merge_repeated = op_desc.GetAttr<bool>("merge_repeated");
param_.padding_value = op_desc.GetAttr<int>("padding_value");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(ctc_align, paddle::lite::operators::CtcAlignOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class CtcAlignOpLite : public OpLite {
public:
CtcAlignOpLite() {}
explicit CtcAlignOpLite(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "ctc_align"; }
private:
mutable CtcAlignParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -336,17 +336,22 @@ struct ConcatParam : ParamBase { ...@@ -336,17 +336,22 @@ struct ConcatParam : ParamBase {
/// ----------------------- activation operators ---------------------- /// ----------------------- activation operators ----------------------
struct ActivationParam : ParamBase { struct ActivationParam : ParamBase {
const lite::Tensor* X{}; const lite::Tensor* X{};
lite::Tensor* Out{};
lite_api::ActivationType active_type;
bool has_active{false};
float Leaky_relu_alpha{0}; // leaky_relu param float Leaky_relu_alpha{0}; // leaky_relu param
float Relu_clipped_coef{6}; // relu_clipped param float Relu_clipped_coef{6}; // relu_clipped param
std::string Prelu_mode{ std::string Prelu_mode{
"channel"}; // prelu param, can be "all", "channel" or "element" "channel"}; // prelu param, can be "all", "channel" or "element"
lite::Tensor* Prelu_alpha{}; // prelu param lite::Tensor* Prelu_alpha{}; // prelu param
float Swish_beta; // swish param float Swish_beta; // swish param
// hard_sigmoid param
float hard_sigmoid_slope{0.2}; float hard_sigmoid_slope{0.2};
float hard_sigmoid_offset{0.5}; float hard_sigmoid_offset{0.5};
lite::Tensor* Out{}; // hard_swish param
bool has_active{false}; float hard_swish_threshold{6.0};
lite_api::ActivationType active_type; float hard_swish_scale{6.0};
float hard_swish_offset{3.0};
}; };
struct ActivationGradParam : ParamBase { struct ActivationGradParam : ParamBase {
...@@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase { ...@@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase {
int ref_level{-1}; int ref_level{-1};
}; };
struct SequenceUnpadParam : ParamBase {
const lite::Tensor* X{};
const lite::Tensor* Length{};
lite::Tensor* Out{};
};
struct SequenceExpandAsParam : ParamBase { struct SequenceExpandAsParam : ParamBase {
const lite::Tensor* x{nullptr}; const lite::Tensor* x{nullptr};
const lite::Tensor* y{nullptr}; const lite::Tensor* y{nullptr};
...@@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase { ...@@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase {
lite::Tensor* viterbi_path{}; lite::Tensor* viterbi_path{};
}; };
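// ctc_align parameters: blank is the label to drop, merge_repeated collapses consecutive
// duplicates, and padding_value fills the tail when explicit lengths are given.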
struct CtcAlignParam : ParamBase {
lite::Tensor* input{};
lite::Tensor* input_length{};
lite::Tensor* output{};
lite::Tensor* output_length{};
int blank{0};
bool merge_repeated{true};
int padding_value{0};
};
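// Parameters of the fused XPU ResNet50 op; filter/bias/max_filter each hold one tensor
// per convolution in the fused graph.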
struct XPUResNet50Param : ParamBase {
lite::Tensor* input{};
std::vector<lite::Tensor*> filter;
std::vector<lite::Tensor*> bias;
std::vector<lite::Tensor*> max_filter;
lite::Tensor* output{};
};
struct XPUMultiEncoderParam : ParamBase {
lite::Tensor* input{};
std::vector<lite::Tensor*> fc_weight;
std::vector<lite::Tensor*> fc_bias;
std::vector<lite::Tensor*> ln_scale;
std::vector<lite::Tensor*> ln_bias;
lite::Tensor* fc_weight_max{};
lite::Tensor* mask{};
lite::Tensor* output{};
int n_layers{};
int head_num{};
int size_per_head{};
std::string act_type{};
};
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_unpad_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequenceUnpadOp::CheckShape() const {
CHECK_OR_FALSE(param_.X);
CHECK_OR_FALSE(param_.Length);
CHECK_OR_FALSE(param_.Out);
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
CHECK(x_dims.size() >= 2) << "Rank of X can't be less than 2";
CHECK(len_dims.size() == 1) << "Rank of Length should be 1";
CHECK(x_dims[0] == len_dims[0])
<< "X and Length should have the same 1st dim";
return true;
}
bool SequenceUnpadOp::InferShapeImpl() const {
auto x_dims = param_.X->dims();
auto len_dims = param_.Length->dims();
auto *seq_len_ptr = param_.Length->data<int64_t>();
int64_t batch_size = len_dims[0];
std::vector<uint64_t> out_lod0(batch_size + 1, 0);
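// Accumulate per-sequence lengths into the output LoD, e.g. lengths {3, 2} -> lod {0, 3, 5}.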
for (int64_t i = 0; i < batch_size; ++i) {
out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i];
}
paddle::lite::LoD out_lod;
out_lod.push_back(out_lod0);
int64_t out_dim0 = out_lod0.back();
std::vector<int64_t> out_dims{out_dim0};
if (x_dims.size() == 2) {
out_dims.push_back(1);
} else {
for (size_t i = 2; i < x_dims.size(); ++i) {
out_dims.push_back(x_dims[i]);
}
}
param_.Out->Resize(out_dims);
param_.Out->set_lod(out_lod);
return true;
}
bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) {
param_.X = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
param_.Length = const_cast<lite::Tensor *>(
&scope->FindVar(opdesc.Input("Length").front())->Get<lite::Tensor>());
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_unpad, paddle::lite::operators::SequenceUnpadOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequenceUnpadOp : public OpLite {
public:
SequenceUnpadOp() {}
explicit SequenceUnpadOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShapeImpl() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_unpad"; }
private:
mutable SequenceUnpadParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const { ...@@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const {
bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
auto X = op_desc.Input("X"); auto X = op_desc.Input("X");
auto Out = op_desc.Output("Y").front(); auto Out = op_desc.Output("Y").front();
param_.X.clear();
for (auto var : X) { for (auto var : X) {
param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>()); param_.X.emplace_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
} }
......
add_subdirectory(kernels) add_subdirectory(kernels)
add_subdirectory(math) add_subdirectory(math)
add_subdirectory(cv) add_subdirectory(cv)
add_subdirectory(api)
if(LITE_WITH_XPU)
lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
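// Builds a host tensor of the given shape filled with ones; used as dummy token ids
// for all four model inputs below.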
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.278893, 0.330888, 0.39022}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 3);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
template <typename T>
lite::Tensor GetTensorWithShape(std::vector<int64_t> shape) {
lite::Tensor ret;
ret.Resize(shape);
T* ptr = ret.mutable_data<T>();
for (int i = 0; i < ret.numel(); ++i) {
ptr[i] = (T)1;
}
return ret;
}
TEST(Ernie, test_ernie_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
int64_t batch_size = 1;
int64_t seq_len = 64;
Tensor sample_input = GetTensorWithShape<int64_t>({batch_size, seq_len, 1});
std::vector<int64_t> input_shape{batch_size, seq_len, 1};
predictor->GetInput(0)->Resize(input_shape);
predictor->GetInput(1)->Resize(input_shape);
predictor->GetInput(2)->Resize(input_shape);
predictor->GetInput(3)->Resize(input_shape);
memcpy(predictor->GetInput(0)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(1)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(2)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
memcpy(predictor->GetInput(3)->mutable_data<int64_t>(),
sample_input.raw_data(),
sizeof(int64_t) * batch_size * seq_len);
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>({0.108398}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1);
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(
out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/lite_api_test_helper.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
TEST(Resnet50, test_resnet50_lite_xpu) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
config.set_xpu_workspace_l3_size_per_thread();
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
std::vector<int64_t> input_shape{1, 3, 224, 224};
input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
for (int i = 0; i < input_num; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<std::vector<float>> results;
results.emplace_back(std::vector<float>(
{0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516,
0.00018169, 0.000289721, 0.000855934, 0.000732185, 9.2055e-05,
0.000220664, 0.00235289, 0.00571265, 0.00357688, 0.00129667,
0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033}));
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2);
ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000);
int step = 50;
for (size_t i = 0; i < results.size(); ++i) {
for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j],
1e-5);
}
}
}
} // namespace lite
} // namespace paddle
if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
endif() endif()
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA) ...@@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA)
lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
# for training kernel # for training kernel
if (LITE_WITH_TRAIN) if (LITE_WITH_TRAIN)
......
...@@ -36,7 +36,9 @@ enum activation_type_test { ...@@ -36,7 +36,9 @@ enum activation_type_test {
FLOOR, FLOOR,
RSQRT, RSQRT,
GELU, GELU,
SQUARE SQUARE,
HARD_SWISH,
RECIPROCAL
}; };
class ActivationComputeTester : public arena::TestCase { class ActivationComputeTester : public arena::TestCase {
...@@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase {
float relu_clipped_coef_ = 6.; float relu_clipped_coef_ = 6.;
std::string prelu_mode_ = ""; std::string prelu_mode_ = "";
float swish_beta_ = 0.; float swish_beta_ = 0.;
float hard_swish_threshold = 6.0;
float hard_swish_scale = 6.0;
float hard_swish_offset = 3.0;
DDim dims_{{1}}; DDim dims_{{1}};
std::string type_ = ""; std::string type_ = "";
activation_type_test act_type_ = RELU; activation_type_test act_type_ = RELU;
...@@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase {
} }
break; break;
} }
case HARD_SWISH: {
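// Reference hard_swish: out = min(max(0, x + offset), threshold) * x / scale.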
for (int i = 0; i < dims_.production(); i++) {
float max_value = std::max(0.f, x_data[i] + hard_swish_offset);
float min_value = std::min(max_value, hard_swish_threshold);
output_data[i] = min_value * x_data[i] / hard_swish_scale;
}
break;
}
case RECIPROCAL: {
for (int i = 0; i < dims_.production(); i++) {
output_data[i] = 1.0 / x_data[i];
}
break;
}
default: default:
LOG(INFO) << "the type of activation is unknow."; LOG(INFO) << "the type of activation is unknow.";
} }
...@@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase { ...@@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase {
if (act_type_ == SWISH) { if (act_type_ == SWISH) {
op_desc->SetAttr("beta", swish_beta_); op_desc->SetAttr("beta", swish_beta_);
} }
if (act_type_ == HARD_SWISH) {
op_desc->SetAttr("threshold", hard_swish_threshold);
op_desc->SetAttr("scale", hard_swish_scale);
op_desc->SetAttr("offset", hard_swish_offset);
}
} }
void PrepareData() override { void PrepareData() override {
...@@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) { ...@@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) {
} }
} }
TEST(activation_hard_swish, precision) {
LOG(INFO) << "test hard_swish op";
Place place;
float abs_error = 2e-5;
#if defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
return;
#endif
for (auto dims : std::vector<std::vector<int64_t>>{
{1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
std::unique_ptr<arena::TestCase> tester(
new ActivationComputeTester(place,
"def",
0.01,
6.,
"all",
0.,
DDim(dims),
"hard_swish",
HARD_SWISH));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
TEST(activation_reciprocal, precision) {
LOG(INFO) << "test reciprocal op";
Place place;
float abs_error = 2e-5;
#if defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
return;
#endif
for (auto dims : std::vector<std::vector<int64_t>>{
{1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
std::unique_ptr<arena::TestCase> tester(
new ActivationComputeTester(place,
"def",
0.01,
6.,
"all",
0.,
DDim(dims),
"reciprocal",
RECIPROCAL));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
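// ctc_align strips blanks (and optionally merges repeated labels) from CTC predictions;
// the tester feeds fixed inputs and compares the kernel output against precomputed results.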
class CtcAlignComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string input_ = "input";
std::string input_length_ = "input_length";
std::string output_ = "output";
std::string output_length_ = "output_length";
std::vector<int> input_data_;
std::vector<int64_t> input_shape_;
std::vector<std::vector<uint64_t>> input_lod_;
std::vector<int> input_length_data_;
std::vector<int64_t> input_length_shape_;
std::vector<int> output_data_;
std::vector<int64_t> output_shape_;
std::vector<std::vector<uint64_t>> output_lod_;
std::vector<int> output_length_data_;
std::vector<int64_t> output_length_shape_;
int blank_;
bool merge_repeated_;
int padding_value_;
public:
CtcAlignComputeTester(const Place& place,
const std::string& alias,
const std::vector<int>& input_data,
const std::vector<int64_t> input_shape,
const std::vector<std::vector<uint64_t>>& input_lod,
const std::vector<int>& input_length_data,
const std::vector<int64_t> input_length_shape,
const int blank,
const bool merge_repeated,
const int padding_value,
const std::vector<int>& output_data,
const std::vector<int64_t>& output_shape,
const std::vector<std::vector<uint64_t>>& output_lod,
const std::vector<int>& output_length_data,
const std::vector<int64_t>& output_length_shape)
: TestCase(place, alias) {
input_data_ = input_data;
input_shape_ = input_shape;
input_lod_ = input_lod;
input_length_data_ = input_length_data;
input_length_shape_ = input_length_shape;
blank_ = blank;
merge_repeated_ = merge_repeated;
padding_value_ = padding_value;
output_data_ = output_data;
output_shape_ = output_shape;
output_lod_ = output_lod;
output_length_data_ = output_length_data;
output_length_shape_ = output_length_shape;
}
void RunBaseline(Scope* scope) override {
auto* output_tensor = scope->NewTensor(output_);
output_tensor->Resize(output_shape_);
if (!output_lod_.empty()) {
output_tensor->set_lod(output_lod_);
}
auto* output_data = output_tensor->mutable_data<int>();
int64_t output_num = 1;
for (auto e : output_shape_) {
output_num *= e;
}
for (int i = 0; i < output_num; i++) {
output_data[i] = output_data_[i];
}
if (!input_length_data_.empty() && !output_length_data_.empty()) {
auto* output_length_tensor = scope->NewTensor(output_length_);
output_length_tensor->Resize(output_length_shape_);
auto* output_length_data = output_length_tensor->mutable_data<int>();
int64_t num = 1;
for (auto e : output_length_shape_) {
num *= e;
}
for (int i = 0; i < num; i++) {
output_length_data[i] = output_length_data_[i];
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("ctc_align");
op_desc->SetInput("Input", {input_});
op_desc->SetOutput("Output", {output_});
if (!input_length_data_.empty()) {
op_desc->SetInput("InputLength", {input_length_});
op_desc->SetOutput("OutputLength", {output_length_});
}
op_desc->SetAttr("blank", blank_);
op_desc->SetAttr("merge_repeated", merge_repeated_);
op_desc->SetAttr("padding_value", padding_value_);
}
void PrepareData() override {
SetCommonTensor(input_, DDim(input_shape_), input_data_.data(), input_lod_);
if (!input_length_data_.empty()) {
SetCommonTensor(
input_length_, DDim(input_length_shape_), input_length_data_.data());
}
}
};
TEST(CtcAlign1, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0};
const std::vector<int64_t> input_shape = {18, 1};
const std::vector<std::vector<uint64_t>> input_lod = {{11, 7}};
const std::vector<int> input_length_data = {};
const std::vector<int64_t> input_length_shape = {};
const int blank = 0;
const bool merge_repeated = false;
const int padding_value = 0;
const std::vector<int> output_data = {1, 2, 2, 4, 4, 5, 6, 6, 7, 7, 7};
const std::vector<int64_t> output_shape = {11, 1};
const std::vector<std::vector<uint64_t>> output_lod = {{7, 4}};
const std::vector<int> output_length_data = {};
const std::vector<int64_t> output_length_shape = {};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
TEST(CtcAlign2, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
const std::vector<int64_t> input_shape = {3, 6};
const std::vector<std::vector<uint64_t>> input_lod = {};
const std::vector<int> input_length_data = {6, 5, 4};
const std::vector<int64_t> input_length_shape = {3, 1};
const int blank = 0;
const bool merge_repeated = true;
const int padding_value = 0;
const std::vector<int> output_data = {
1, 2, 4, 0, 0, 0, 4, 5, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0};
const std::vector<int64_t> output_shape = {3, 6};
const std::vector<std::vector<uint64_t>> output_lod = {};
const std::vector<int> output_length_data = {3, 3, 1};
const std::vector<int64_t> output_length_shape = {3, 1};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
TEST(CtcAlign3, precision) {
LOG(INFO) << "test ctc_align op";
#ifdef LITE_WITH_ARM
// Define variable
const std::vector<int>& input_data = {
0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0};
const std::vector<int64_t> input_shape = {3, 6};
const std::vector<std::vector<uint64_t>> input_lod = {};
const std::vector<int> input_length_data = {6, 5, 4};
const std::vector<int64_t> input_length_shape = {3, 1};
const int blank = 0;
const bool merge_repeated = false;
const int padding_value = 0;
const std::vector<int> output_data = {
1, 2, 2, 4, 0, 0, 4, 5, 6, 0, 0, 0, 7, 7, 7, 0, 0, 0};
const std::vector<int64_t> output_shape = {3, 6};
const std::vector<std::vector<uint64_t>> output_lod = {};
const std::vector<int> output_length_data = {4, 3, 3};
const std::vector<int64_t> output_length_shape = {3, 1};
// Test
Place place(TARGET(kHost), PRECISION(kInt32));
std::unique_ptr<arena::TestCase> tester(
new CtcAlignComputeTester(place,
"def",
input_data,
input_shape,
input_lod,
input_length_data,
input_length_shape,
blank,
merge_repeated,
padding_value,
output_data,
output_shape,
output_lod,
output_length_data,
output_length_shape));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
#endif
}
} // namespace lite
} // namespace paddle
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
......
...@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON ...@@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON
BUILD_NPU=OFF BUILD_NPU=OFF
NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/
BUILD_XPU=OFF BUILD_XPU=OFF
BUILD_XTCL=OFF
XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/" XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/"
LITE_WITH_ARM_LANG=OFF LITE_WITH_ARM_LANG=OFF
...@@ -138,6 +139,7 @@ function make_tiny_publish_so { ...@@ -138,6 +139,7 @@ function make_tiny_publish_so {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -226,6 +228,7 @@ function make_full_publish_so { ...@@ -226,6 +228,7 @@ function make_full_publish_so {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_TRAIN=$BUILD_TRAIN \ -DLITE_WITH_TRAIN=$BUILD_TRAIN \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -260,6 +263,7 @@ function make_all_tests { ...@@ -260,6 +263,7 @@ function make_all_tests {
-DLITE_WITH_NPU=$BUILD_NPU \ -DLITE_WITH_NPU=$BUILD_NPU \
-DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \
-DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
...@@ -330,7 +334,10 @@ function make_cuda { ...@@ -330,7 +334,10 @@ function make_cuda {
-DWITH_TESTING=OFF \ -DWITH_TESTING=OFF \
-DLITE_WITH_ARM=OFF \ -DLITE_WITH_ARM=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON -DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT
make publish_inference -j$NUM_PROC make publish_inference -j$NUM_PROC
cd - cd -
...@@ -362,9 +369,10 @@ function make_x86 { ...@@ -362,9 +369,10 @@ function make_x86 {
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON \ -DLITE_BUILD_EXTRA=ON \
-DCMAKE_BUILD_TYPE=Release \ -DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XPU=$BUID_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT -DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DCMAKE_BUILD_TYPE=Release
make publish_inference -j$NUM_PROC make publish_inference -j$NUM_PROC
cd - cd -
...@@ -483,6 +491,10 @@ function main { ...@@ -483,6 +491,10 @@ function main {
BUILD_XPU="${i#*=}" BUILD_XPU="${i#*=}"
shift shift
;; ;;
--build_xtcl=*)
BUILD_XTCL="${i#*=}"
shift
;;
--xpu_sdk_root=*) --xpu_sdk_root=*)
XPU_SDK_ROOT="${i#*=}" XPU_SDK_ROOT="${i#*=}"
shift shift
......
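With this change, the XTCL path can be toggled separately from the base XPU build via the new --build_xtcl option. A minimal invocation sketch for the updated build.sh (the positional target name and the SDK path below are illustrative assumptions, not part of this patch):
./lite/tools/build.sh --build_xpu=ON --build_xtcl=ON --xpu_sdk_root=$(pwd)/xpu_sdk_lib x86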
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
set -ex set -ex
# global variables with default value # global variables with default value
NEUWARE_HOME="${NEUWARE_HOME}" # XPU SDK NEUWARE_HOME="${NEUWARE_HOME}"
TARGET_NAME="all" # default target TARGET_NAME="all" # default target
BUILD_EXTRA=OFF # ON(with sequence ops)/OFF BUILD_EXTRA=OFF # ON(with sequence ops)/OFF
WITH_TESTING=OFF # ON/OFF WITH_TESTING=ON # ON/OFF
function print_usage { function print_usage {
echo -e "\nUSAGE:" echo -e "\nUSAGE:"
...@@ -20,10 +20,9 @@ function print_usage { ...@@ -20,10 +20,9 @@ function print_usage {
# readonly variables with default value # readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF" -DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd) readonly workspace=$(pwd)
...@@ -37,8 +36,7 @@ function prepare_thirdparty { ...@@ -37,8 +36,7 @@ function prepare_thirdparty {
fi fi
tar xzf third-party-05b862.tar.gz tar xzf third-party-05b862.tar.gz
else else
# git submodule update --init --recursive git submodule update --init --recursive
echo "third-party is in ready"
fi fi
} }
...@@ -62,12 +60,12 @@ function prepare_workspace { ...@@ -62,12 +60,12 @@ function prepare_workspace {
} }
function build_mlu { function build_mlu {
prepare_workspace
build_dir=${workspace}/build.lite.mlu build_dir=${workspace}/build.lite.mlu
mkdir -p $build_dir mkdir -p $build_dir
cd $build_dir cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \ cmake .. \
${CMAKE_COMMON_OPTIONS} \ ${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
...@@ -75,9 +73,10 @@ function build_mlu { ...@@ -75,9 +73,10 @@ function build_mlu {
-DLITE_WITH_X86=ON \ -DLITE_WITH_X86=ON \
-DWITH_MKL=ON \ -DWITH_MKL=ON \
-DLITE_WITH_MLU=ON \ -DLITE_WITH_MLU=ON \
-DLITE_WITH_PYTHON=OFF \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \ -DWITH_TESTING=${WITH_TESTING} \
-DMLU_SDK_ROOT=${XPU_SDK_ROOT} -DNEUWARE_HOME=${NEUWARE_HOME}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
......
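The MLU build now takes the Cambricon toolkit location from NEUWARE_HOME and passes it straight through to CMake instead of reusing XPU_SDK_ROOT. A minimal sketch of the resulting configure step (the install prefix is an example assumption; the two -D flags come from the diff above):
export NEUWARE_HOME=/usr/local/neuware   # example location only, not from the patch
cmake .. -DLITE_WITH_MLU=ON -DNEUWARE_HOME=${NEUWARE_HOME}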
#!/bin/bash
set -ex
# global variables with default value
XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK
TARGET_NAME="test_subgraph_pass" # default target
BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF
function print_usage {
echo -e "\nUSAGE:"
echo
echo "----------------------------------------"
echo -e "--xpu_sdk_root=<xpu sdk directory>"
echo -e "--target_name=<target name>"
echo "----------------------------------------"
echo
}
# readonly variables with default value
readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DWITH_PYTHON=OFF \
-DLITE_WITH_ARM=OFF"
readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1}
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
readonly workspace=$(pwd)
function prepare_thirdparty {
if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then
rm -rf $workspace/third-party
if [ ! -f $workspace/third-party-05b862.tar.gz ]; then
wget $THIRDPARTY_TAR
fi
tar xzf third-party-05b862.tar.gz
else
git submodule update --init --recursive
fi
}
# For code gen, a source file is generated after a test, but is depended on by some targets in cmake.
# Here we fake an empty file to make cmake work.
function prepare_workspace {
# in build directory
# 1. Prepare gen_code file
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
# 2. Prepare debug tool
DEBUG_TOOL_PATH_PREFIX=lite/tools/debug
mkdir -p ./${DEBUG_TOOL_PATH_PREFIX}
cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
# clone submodule
# git submodule update --init --recursive
prepare_thirdparty
}
function build_xpu {
build_dir=${workspace}/build.lite.xpu
mkdir -p $build_dir
cd $build_dir
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${CMAKE_COMMON_OPTIONS} \
-DWITH_GPU=OFF \
-DWITH_MKLDNN=OFF \
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_WITH_XPU=ON \
-DLITE_BUILD_EXTRA=${BUILD_EXTRA} \
-DWITH_TESTING=${WITH_TESTING} \
-DXPU_SDK_ROOT=${XPU_SDK_ROOT}
make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE
cd -
echo "Done"
}
function main {
# Parse command line.
for i in "$@"; do
case $i in
--target_name=*)
TARGET_NAME="${i#*=}"
shift
;;
--build_extra=*)
BUILD_EXTRA="${i#*=}"
shift
;;
--xpu_sdk_root=*)
XPU_SDK_ROOT="${i#*=}"
shift
;;
build)
build_xpu
shift
;;
full_publish)
TARGET_NAME=publish_inference
build_xpu
shift
;;
*)
# unknown option
print_usage
exit 1
;;
esac
done
}
main $@
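For reference, a hypothetical invocation of this new XPU CI script (the file name ci_build_xpu.sh is an assumption; the options and the build/full_publish subcommands come from the script above):
./lite/tools/ci_build_xpu.sh --xpu_sdk_root=$(pwd)/../XPU_SDK --target_name=test_subgraph_pass build
# or build and package everything:
./lite/tools/ci_build_xpu.sh --xpu_sdk_root=$(pwd)/../XPU_SDK full_publish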