diff --git a/CMakeLists.txt b/CMakeLists.txt index e6066c8a281272d65e68609ddb7b1832344d7699..e1ac0817ee0377cb86d458df07529e8e1e53874b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,8 +59,10 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_HW_ASCEND_NPU "Enable Huawei Ascend NPU in lite mode" OFF) +lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) @@ -182,6 +184,10 @@ if (LITE_WITH_HW_ASCEND_NPU) include(device/hw_ascend_npu) endif() +if(LITE_WITH_MLU) + include(mlu) +endif() + include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package include(external/libxsmm) # download, build, install libxsmm diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e42efc77cb4e7a54c7e6e08481a73c6f49e4fb61..553f218615b12b21e2f96a0becf1d2c3c4d2927b 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -140,6 +140,9 @@ endif() if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") + if (LITE_WITH_XTCL) + add_definitions("-DLITE_WITH_XTCL") + endif() endif() if (LITE_WITH_OPENCL) @@ -154,6 +157,10 @@ if (LITE_WITH_BM) add_definitions("-DLITE_WITH_BM") endif() +if (LITE_WITH_MLU) +add_definitions("-DLITE_WITH_MLU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") endif() diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 099833ee4cf80968671036cffe89329506bbf091..823048552f3cb5f05375e97e94cd5b5ad63e7563 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT) message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") endif() endif() - message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") -find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl - NO_DEFAULT_PATH) -if(NOT XPU_SDK_INC) - message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") -endif() -include_directories("${XPU_SDK_ROOT}/XTCL/include") include_directories("${XPU_SDK_ROOT}/XTDK/include") -find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_XTCL_FILE) - message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") - add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) -endif() - -find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_TVM_FILE) - message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") - add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) -endif() - find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi PATHS ${XPU_SDK_ROOT}/XTDK/shlib NO_DEFAULT_PATH) @@ -82,23 +50,55 @@ else() set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION 
${XPU_SDK_XPU_RT_FILE}) endif() -find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_LLVM_FILE) - message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") - add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) +set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs") + +if(LITE_WITH_XTCL) + find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") + endif() + include_directories("${XPU_SDK_ROOT}/XTCL/include") + + find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) + endif() + + find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) + endif() + + find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) + endif() + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1") + + set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") + set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0") - -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 9a8887cef3f9f1c565466e1b2670721e55b6369a..c346784fbf0c2efc6ca91d433ad259402e382a5b 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -106,6 +106,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_MLU) + foreach(var ${lite_deps_MLU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -151,6 +157,7 @@ function(lite_cc_library TARGET) PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -197,6 +204,7 @@ 
function(lite_cc_binary TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) @@ -226,11 +234,9 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS - ARGS - COMPILE_LEVEL # (basic|extra) - ) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS + XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS + ARGS COMPILE_LEVEL # (basic|extra)) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if (args_COMPILE_LEVEL STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) @@ -254,6 +260,7 @@ function(lite_cc_test TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -278,6 +285,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") +set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") set(hw_ascend_npu_kernels CACHE INTERNAL "huawei ascend npu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") @@ -295,7 +303,7 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") @@ -388,6 +396,12 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "MLU") + if (NOT LITE_WITH_MLU) + return() + endif() + set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) foreach(src ${args_SRCS}) @@ -428,7 +442,8 @@ function(add_kernel TARGET device level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} - BM_DEPS ${args_BM_DEPS} + BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -447,7 +462,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -482,7 +497,8 @@ function(add_operator TARGET level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} - BM_DEPS ${args_BM_DEPS} + BM_DEPS ${args_BM_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git 
a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ed2c6a3c15c07c03787ec8a843aae048533aaed7..7becebab5168d3afdbaef95351a387b7b01dece4 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -8,7 +8,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") +message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WTH_HW_ASCEND_NPU:\t${LITE_WITH_HW_ASCEND_NPU}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") @@ -109,59 +111,53 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference publish_inference_python_light_demo) endif() +if (LITE_WITH_CUDA OR LITE_WITH_X86) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) +endif() + if (LITE_WITH_X86) add_custom_target(publish_inference_x86_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" ) - add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) - add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) - add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND cp -r 
"${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + add_dependencies(publish_inference publish_inference_x86_cxx_demos) endif() if(LITE_WITH_CUDA) - add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cuda_cxx_lib) - add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) - add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) -endif(LITE_WITH_CUDA) + add_dependencies(publish_inference publish_inference_cuda_cxx_demos) +endif(LITE_WITH_CUDA) + if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -193,7 +189,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so) endif() endif() else() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 952961055e6e92e4fe17b20eb224edbbf4e737e8..ad34b0669593edaea69c775fd848135688c96fce 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -8,17 +8,19 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() -set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) + +set(light_lib_DEPS light_api paddle_api paddle_api_light) + if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library - add_library(paddle_full_api_shared SHARED "") - target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc 
light_api_impl.cc) + lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc + DEPS paddle_api paddle_api_light paddle_api_full) add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) - if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) add_dependencies(paddle_full_api_shared dynload_mklml) endif() endif() @@ -27,13 +29,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH endif(LITE_WITH_CUDA) #light api dynamic library - lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels}) - + lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + ) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") @@ -67,6 +69,7 @@ if (WITH_TESTING) XPU_DEPS ${xpu_kernels} BM_DEPS ${bm_kernels} HW_ASCENND_NPU_DEPS ${hw_ascend_npu_kernels}) + MLU_DEPS ${mlu_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -89,6 +92,7 @@ message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get HW_ASCEND_NPU kernels ${hw_ascend_npu_kernels}") +message(STATUS "get MLU kernels ${mlu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -129,6 +133,7 @@ lite_cc_library(light_api SRCS light_api.cc FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}) + MLU_DEPS ${mlu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -148,6 +153,7 @@ if(WITH_TESTING) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} + MLU_DEPS ${mlu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -294,6 +300,7 @@ lite_cc_test(test_apis SRCS apis_test.cc XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -332,6 +339,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} + MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -345,30 +353,33 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - + lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS 
paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - + lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} @@ -381,17 +392,19 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 0843faf0d6b060a5b76a850de069b1dbf714da19..81708bc625f5e4bda7599b9c2563225f05653c5e 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -27,6 +27,9 @@ #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", "the path of the model, the model and param files is under " @@ -61,10 +64,7 @@ DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", "save the inference time to the file."); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. 
"); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -100,15 +100,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK void Run(const std::vector& input_shape, - const std::string& model_dir, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -116,10 +124,7 @@ void Run(const std::vector& input_shape, auto input_tensor = predictor->GetInput(0); input_tensor->Resize(input_shape); auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shape.size(); ++i) { - input_num *= input_shape[i]; - } + int64_t input_num = ShapeProduction(input_shape); if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; @@ -167,26 +172,73 @@ void Run(const std::vector& input_shape, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (the path of the model that is optimized\n" + " by opt.) type: string \n" + " --model_dir (the path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (the filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (the filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (the path of input image, if not set\n" + " input_img_path, the input will be 1.0.) type: string \n " + " --power_mode (arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (repeats times) type: int32 default: 1 \n" + " --result_filename (save the inference time to the file.) type: \n" + " string default: result.txt \n" + " --threads (threads num) type: int32 default: 1 \n" + " --warmup (warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path, or set\n" + " model_dir, model_filename and param_filename according to the\n" + " model. 
\n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "") { - LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - if (FLAGS_model_dir.back() == '/') { - FLAGS_model_dir.pop_back(); - } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -202,19 +254,31 @@ int main(int argc, char** argv) { } return shape; }; - std::vector input_shape = get_shape(FLAGS_input_shape); - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shape, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 6a14b807cf14939309a6e8058877bfa78cb68367..3f3428b434e98ffb0ba578ef7f31a4fbcd9ca619 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -19,6 +19,7 @@ #include #include #include +#include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index e63893cb91e112beb6be50bd661a57b9738e5fb1..146556756af7e0b56ae38b5303e622c97dfe58af 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -43,6 +43,7 @@ class LITE_API Predictor { public: // Create an empty predictor. Predictor() { scope_ = std::make_shared(); } + // Create a predictor with the weight variable scope set. 
explicit Predictor(const std::shared_ptr& root_scope) : scope_(root_scope) {} diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 972210c8f9ea05ba1b041382c43efad64aeacc1b..ccd7c981385ff776c47c01fbfdd058001341dff6 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -31,10 +31,26 @@ namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; + auto places = config.valid_places(); #ifdef LITE_WITH_CUDA - Env::Init(); + // if kCUDA is included in valid places, it should be initialized first, + // otherwise skip this step. + for (auto &p : places) { + if (p.target == TARGET(kCUDA)) { + Env::Init(); + break; + } + } #endif - auto places = config.valid_places(); +#ifdef LITE_WITH_MLU + Env::Init(); + lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_use_first_conv(), + config.mlu_first_conv_mean(), + config.mlu_first_conv_std(), + config.mlu_input_layout()); +#endif // LITE_WITH_MLU std::vector passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index b641973a15b2e6abc1cf4c999d759271f7522638..01f8853cb9ad2de903e2d6ce675a189b4932f309 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,13 +13,10 @@ // limitations under the License. #include "lite/api/light_api.h" +#include +#include #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif - -#include namespace paddle { namespace lite { @@ -32,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For weight quantization of post training, load the int8/16 weights + // for optimized model, and dequant it to fp32. DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -139,7 +139,15 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 1. Create op first Program program(prog, scope_, {}); - // 2. Create Instructs +// 2. Create Instructs +#ifdef LITE_WITH_OPENCL + using WaitListType = + std::unordered_map(nullptr)), + std::shared_ptr>; + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif // Create the kernels of the target places, and filter out the specific // kernel with the target alias. 
@@ -155,7 +163,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { return it->alias() == alias; }); CHECK(it != kernels.end()); + +#ifdef LITE_WITH_OPENCL + if ((*it)->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + (*it)->SetContext(std::move(ctx)); + } else { + (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); + } +#else (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +#endif insts.emplace_back(op, std::move(*it)); } @@ -166,58 +185,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : 
input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc deleted file mode 100644 index cfe3d9de09a646e33c4a116bb3cd087d28aa24c2..0000000000000000000000000000000000000000 --- a/lite/api/light_api_shared.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/paddle_api.h" - -namespace paddle { -namespace lite_api { - -void RunModel() { - // 1. Set MobileConfig - MobileConfig mobile_config; - - // 2. Create PaddlePredictor by MobileConfig - std::shared_ptr mobile_predictor = - CreatePaddlePredictor(mobile_config); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 12003050af864da7d88d335553d71007cf5ed9c5..7a8cd7f1ef1234269c986b781f0546b26df53c4b 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -109,6 +109,8 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "mlu") { + valid_places.emplace_back(TARGET(kMLU)); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 2cb2064da518bca442e882d0733c5c6966c4fac0..daef2c66dda5188a1eec25c3d5f045f1fa705e1e 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/api/paddle_api.h" +#include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" @@ -203,6 +204,58 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU +void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { + mlu_core_version_ = core_version; +} +void CxxConfig::set_mlu_core_number(int core_number) { + mlu_core_number_ = core_number; +} +void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { + mlu_input_layout_ = layout; +} +void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { + mlu_use_first_conv_ = use_first_conv; +} +void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { + mlu_first_conv_mean_ = mean; +} +void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { + mlu_first_conv_std_ = std; +} +lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { + return mlu_core_version_; +} +int CxxConfig::mlu_core_number() const { return mlu_core_number_; } +DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } +bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } +const std::vector &CxxConfig::mlu_first_conv_mean() const { + return mlu_first_conv_mean_; +} +const std::vector &CxxConfig::mlu_first_conv_std() const { + return mlu_first_conv_std_; +} +#endif + +void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { +#ifdef LITE_WITH_XPU + lite::Context::SetWorkspaceL3Size(l3_size); +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_workspace_l3_size_per_thread' is ignored, please " + "rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_dev_per_thread(int dev_no) { +#ifdef LITE_WITH_XPU + lite::Context::SetDev(dev_no); +#else + LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} + // set model data in combined format, `set_model_from_file` refers to loading // model from file, set_model_from_buffer refers to loading model from memory // buffer diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index c445ef641b96d9fbbc5b4123be794976c0cf03c4..ce0f0e15d84835fab733a5114906e0a0df3a0064 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,6 +136,14 @@ class LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_MLU + lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; + int mlu_core_number_{1}; + DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; + bool mlu_use_first_conv_{false}; + std::vector mlu_first_conv_mean_; + std::vector mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -163,6 +171,37 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif + +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels + void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels + void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); + // whether use MLU's first conv kernel. 
First conv is a special kernel + // provided by MLU, its input is uint8, and it also needs two 3-dimensional + // vectors which save all inputs' mean and std values + void set_mlu_use_first_conv(bool use_first_conv); + // set the 3-dimensional mean vector used by MLU's first conv + void set_mlu_first_conv_mean(const std::vector& mean); + // set the 3-dimensional std vector used by MLU's first conv + void set_mlu_first_conv_std(const std::vector& std); + + lite_api::MLUCoreVersion mlu_core_version() const; + int mlu_core_number() const; + DataLayoutType mlu_input_layout() const; + bool mlu_use_first_conv() const; + const std::vector& mlu_first_conv_mean() const; + const std::vector& mlu_first_conv_std() const; +#endif + + // XPU only, set the size of the workspace memory from L3 cache for the + // current thread. + void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); + // XPU only, specify the target device ID for the current thread. + void set_xpu_dev_per_thread(int dev_no = 0); }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index fd6612d40ef0f60e68739d268c8b3c1a97835b25..485ce8d08eaa806ffbbbd96ad0cce776d1264c4b 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "ascend310"}; + "mlu", + "hw_ascend_npu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -113,6 +114,7 @@ const std::string& TargetRepr(TargetType target) { "kNPU", "kXPU", "kBM", + "kMLU", "kHWAscendNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); @@ -155,6 +157,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kNPU), TARGET(kXPU), TARGET(kBM), + TARGET(kMLU), TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 88447f90b6efc5dd5f5d850d6b23d88e0716f919..a5ec29ec493e5d1b7a33c1d7067a540c2064a615 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -53,10 +53,10 @@ enum class TargetType : int { kNPU = 8, kXPU = 9, kBM = 10, - kAny = 6, // any target kMLU = 11, kHWAscendNPU = 12, - NUM = 13, // number of fields. + kAny = 6, // any target + NUM = 13 // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, @@ -90,6 +90,8 @@ typedef enum { LITE_POWER_RAND_LOW = 5 } PowerMode; +typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion; + enum class ActivationType : int { kIndentity = 0, kRelu = 1, @@ -101,7 +103,9 @@ enum class ActivationType : int { kSwish = 7, kExp = 8, kAbs = 9, - NUM = 10, + kHardSwish = 10, + kReciprocal = 11, + NUM = 12, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 41eca021a9ded40134122cb7b68604d9cd8f9fc2..219952bd2aa440c81b116d9ae8aaba0920268eb5 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -45,5 +45,9 @@ USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); +USE_MIR_PASS(mlu_subgraph_pass); +USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index e86d570e18b50bdc3d8943ecdd3732f8475ad56c..5512e7bc438eddd6bcd9c8f792fc8507b03bf800 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -47,6 +47,7 @@ using lite_api::TargetType; using lite_api::PrecisionType; using lite_api::DataLayoutType; using lite_api::Place; +using lite_api::MLUCoreVersion; using lite::LightPredictorImpl; using lite_api::OptBase; @@ -76,6 +77,7 @@ static void BindLiteMobileConfig(py::module *m); static void BindLitePowerMode(py::module *m); static void BindLitePlace(py::module *m); static void BindLiteTensor(py::module *m); +static void BindLiteMLUCoreVersion(py::module *m); void BindLiteApi(py::module *m) { BindLiteCxxConfig(m); @@ -83,6 +85,7 @@ void BindLiteApi(py::module *m) { BindLitePowerMode(m); BindLitePlace(m); BindLiteTensor(m); + BindLiteMLUCoreVersion(m); #ifndef LITE_ON_TINY_PUBLISH BindLiteCxxPredictor(m); #endif @@ -124,6 +127,14 @@ void BindLiteCxxConfig(py::module *m) { .def("set_power_mode", &CxxConfig::set_power_mode) .def("power_mode", &CxxConfig::power_mode); #endif +#ifdef LITE_WITH_MLU + cxx_config.def("set_mlu_core_version", &CxxConfig::set_mlu_core_version) + .def("set_mlu_core_number", &CxxConfig::set_mlu_core_number) + .def("set_mlu_input_layout", &CxxConfig::set_mlu_input_layout) + .def("set_mlu_use_first_conv", &CxxConfig::set_mlu_use_first_conv) + .def("set_mlu_first_conv_mean", &CxxConfig::set_mlu_first_conv_mean) + .def("set_mlu_first_conv_std", &CxxConfig::set_mlu_first_conv_std); +#endif } // TODO(sangoly): Should MobileConfig be renamed to LightConfig ?? 
@@ -155,6 +166,12 @@ void BindLitePowerMode(py::module *m) { .value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW); } +void BindLiteMLUCoreVersion(py::module *m) { + py::enum_(*m, "MLUCoreVersion") + .value("LITE_MLU_220", MLUCoreVersion::MLU_220) + .value("LITE_MLU_270", MLUCoreVersion::MLU_270); +} + void BindLitePlace(py::module *m) { // TargetType py::enum_(*m, "TargetType") @@ -165,6 +182,7 @@ void BindLitePlace(py::module *m) { .value("OpenCL", TargetType::kOpenCL) .value("FPGA", TargetType::kFPGA) .value("NPU", TargetType::kNPU) + .value("MLU", TargetType::kMLU) .value("Any", TargetType::kAny); // PrecisionType @@ -245,6 +263,20 @@ void BindLiteTensor(py::module *m) { DO_GETTER_ONCE(data_type__, name__##_data) DATA_GETTER_SETTER_ONCE(int8_t, int8); +#ifdef LITE_WITH_MLU + tensor.def("set_uint8_data", + [](Tensor &self, + const std::vector &data, + TargetType type = TargetType::kHost) { + if (type == TargetType::kHost) { + self.CopyFromCpu(data.data()); + } + }, + py::arg("data"), + py::arg("type") = TargetType::kHost); + + DO_GETTER_ONCE(uint8_t, "uint8_data"); +#endif DATA_GETTER_SETTER_ONCE(int32_t, int32); DATA_GETTER_SETTER_ONCE(float, float); #undef DO_GETTER_ONCE diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index e3517464812a24c9911e824c53841efc05dd2bc5..fb459ae3621d1281f0a2433ca6b237a165d078a1 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -6,4 +6,5 @@ add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) add_subdirectory(xpu) +add_subdirectory(mlu) add_subdirectory(bm) diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 9f478eab60538eeca38415afea4e0989eff5a04e..26e63e23f6acb761b61b397bb881d425e3442468 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/backends/arm/math/activation.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -711,6 +712,38 @@ void act_square(const float* din, float* dout, int size, int threads) { } } +template <> +void act_hard_swish(const float* din, + float* dout, + int size, + float threshold, + float scale, + float offset, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) * + ptr_in[0] / scale; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_reciprocal(const float* din, + float* dout, + int size, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = 1.0 / ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 63f4418d70db25f98dea2a405de1f4bb6b0b9111..ca6b146442a3ec324a9bd244ee4ce6ad0601d4d7 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -72,6 +72,17 @@ void act_rsqrt(const T* din, T* dout, int size, int threads); template void act_square(const T* din, T* dout, int size, int threads); +template +void act_hard_swish(const T* din, + T* dout, + int size, + float threshold, + float scale, + float offset, + int threads); +template +void act_reciprocal(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 7fa12a5e5e697549c32a2f477119eee2fddfc700..0edb83acc4772b2f878b22f2ea16b3175b14a7ba 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,7 +10,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "lite/backends/opencl/cl_context.h" -#include #include #include #include @@ -55,7 +51,8 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, void CLContext::AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options) { + const std::string &options, + const std::string &time_stamp) { cl_int status{CL_SUCCESS}; VLOG(3) << " --- to get program " << file_name << " --- "; auto program = GetProgram(file_name, options); @@ -67,7 +64,7 @@ void CLContext::AddKernel(const std::string &kernel_name, VLOG(3) << " --- end create kernel --- "; kernels_.emplace_back(std::move(kernel)); STL::stringstream kernel_key; - kernel_key << kernel_name << options; + kernel_key << kernel_name << options << time_stamp; kernel_offset_[kernel_key.str()] = kernels_.size() - 1; } diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 7ffe1bc87d86cf370074091e7adf16c8460d218a..586dc3df1267e47c6cdaad1d362cd9ed2df2770e 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -27,6 +27,20 @@ namespace lite { class CLContext { public: + ~CLContext() { + for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { + clReleaseKernel(kernels_[kidx]->get()); + kernels_[kidx].reset(); + } + kernels_.clear(); + kernel_offset_.clear(); + for (auto &p : programs_) { + clReleaseProgram(p.second->get()); + } + programs_.clear(); + LOG(INFO) << "release cl::Program, cl::Kernel finished."; + } + cl::CommandQueue &GetCommandQueue(); cl::Context &GetContext(); @@ -36,7 +50,8 @@ class CLContext { void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = ""); + const std::string &options = "", + const std::string &time_stamp = ""); cl::Kernel &GetKernel(const int index); @@ -45,12 +60,12 @@ class CLContext { cl::NDRange DefaultWorkSize(const CLImage &image); cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // size_t max_work_size); - private: std::unordered_map> programs_; std::vector> kernels_; diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index cb29860dc7556bdaea3c09589a8c6120c5ef2a1a..08491d5d9fd195430a4b03673c38767f7e4a5be8 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale) { + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); + CL_DTYPE4 out; + out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); + out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); + out.z = 1.0 / (1.0 + pow(2.71828182, 
-1.0 * (float)(in.z))); + out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void decode_center_size(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh % out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + CL_DTYPE4 prior_box_input[4]; + CL_DTYPE4 prior_box_var_input[4]; + CL_DTYPE4 target_box_input[4]; + + prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 0, prior_box_pos.y)); + prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 1, prior_box_pos.y)); + prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 2, prior_box_pos.y)); + prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 3, prior_box_pos.y)); + + prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y)); + prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y)); + prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y)); + prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y)); + + target_box_input[0] = 
READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 1, target_box_pos.y)); + target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 2, target_box_pos.y)); + target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 3, target_box_pos.y)); + + CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2; + CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2; + + CL_DTYPE4 target_box_center_x; + CL_DTYPE4 target_box_center_y; + CL_DTYPE4 target_box_width; + CL_DTYPE4 target_box_height; + CL_DTYPE4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * 
target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); +} diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 63c9954f9181e9252c4d14f57b6ed29107965fe3..8a6b026367986548b017aee263a70d4df33381b5 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,6 +26,7 @@ CLRuntime* CLRuntime::Global() { CLRuntime::~CLRuntime() { if (command_queue_ != nullptr) { + command_queue_->flush(); command_queue_->finish(); } // For controlling the destruction order: @@ -36,6 +34,7 @@ CLRuntime::~CLRuntime() { context_.reset(); device_.reset(); platform_.reset(); + LOG(INFO) << "release ~CLRuntime() "; } bool CLRuntime::Init() { diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 1a5ededeff37d9f6820af6a49dc22c669620734b..2a8996b066a480d9c0a6db67fa5fd60142885046 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
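For readers skimming the new box_coder kernel above, the per-anchor arithmetic it performs is the standard center-size box decoding. A minimal scalar C++ sketch of the same math follows; the function name, signature, and array layout are illustrative only (not part of the patch), and the kernel simply applies this computation to four channel values at a time via CL_DTYPE4 reads and writes.

```cpp
#include <cmath>

// prior:  [xmin, ymin, xmax, ymax] of the anchor (prior) box
// var:    [vx, vy, vw, vh] prior-box variances
// delta:  [tx, ty, tw, th] encoded offsets (the "target box" in the kernel)
// out:    decoded [xmin, ymin, xmax, ymax]
inline void DecodeCenterSize(const float prior[4], const float var[4],
                             const float delta[4], float out[4]) {
  const float pw  = prior[2] - prior[0];            // prior width
  const float ph  = prior[3] - prior[1];            // prior height
  const float pcx = (prior[2] + prior[0]) * 0.5f;   // prior center x
  const float pcy = (prior[3] + prior[1]) * 0.5f;   // prior center y

  const float cx = var[0] * delta[0] * pw + pcx;      // decoded center x
  const float cy = var[1] * delta[1] * ph + pcy;      // decoded center y
  const float w  = std::exp(var[2] * delta[2]) * pw;  // decoded width
  const float h  = std::exp(var[3] * delta[3]) * ph;  // decoded height

  out[0] = cx - w * 0.5f;
  out[1] = cy - h * 0.5f;
  out[2] = cx + w * 0.5f;
  out[3] = cy + h * 0.5f;
}
```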
diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 8d61fb3bbb97705c697fba934e6cab9424f85bad..9cf3281152840416dc141f98992499c663783b7a 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -96,8 +96,8 @@ class BeamSearchFunctor { // : nullptr; // fill in data - std::vector low_level; - size_t low_offset = 0; + std::vector low_level; + uint64_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc index 904870207b08d462025ecb4b84d6cf57f7b13f26..233fa03fbaa31165dae4453affb148276f8c6584 100644 --- a/lite/backends/x86/math/beam_search_test.cc +++ b/lite/backends/x86/math/beam_search_test.cc @@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* pre_scores) { // lod paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 72d0736268f342187f0be8c6348f5bed75df30ea..34b258892be05625ae88076eff175f56a53d3537 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -483,7 +483,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } template <> @@ -759,7 +759,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } else { PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); @@ -773,7 +773,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data(), + mat_out->template mutable_data(), dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, dim_a.stride_, dim_b.stride_); diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc index bec93dde41fdb654cfbfd20f5d9e59d1d372e3a8..df75654aebaba26b9889d97445bd889cdf2f4eb0 100644 --- a/lite/backends/x86/math/concat_and_split.cc +++ b/lite/backends/x86/math/concat_and_split.cc @@ -51,7 +51,7 @@ class ConcatFunctor { // auto cpu_place = boost::get(context.GetPlace()); // computation - auto output_data = output->mutable_data(); + auto output_data = output->template mutable_data(); int col_idx = 0; for (int j = 0; j < num; ++j) { int col_len = input_cols[j]; @@ -108,7 +108,7 @@ class SplitFunctor { int col_len = output_cols[j]; auto* out_tensor = outputs->at(j); if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; + T* dst_ptr = out_tensor->template mutable_data() + k * col_len; std::copy_n(src_ptr + col_idx, col_len, dst_ptr); // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, // sizeof(T) * col_len); diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 366486924a8c4a5eefd6341183b4f1bc1c0277ad..941a34643669f060cdd18f38f92c39e529da7b19 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -50,8 +50,8 @@ class CrossEntropyFunctor { .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); + const T* prob_data = prob->template data(); + T* loss_data = out->template mutable_data(); const int64_t* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index 1c4c6a49f5bb804a57344c59368d18255e8a7912..b916c912ffc2a4d62b63b98fdce150b353ba087e 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -99,7 +99,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int c = 0; c < channels_col; ++c) { @@ -161,7 +161,7 @@ class Im2ColFunctordims()[1]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { @@ -235,7 +235,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h index 4623f045bb1cbe67605b36621efcc3285b989ad5..97579647d4ec3a9a95e033a153417cb0aaadbeb6 100644 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ b/lite/backends/x86/math/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, int output_width = col->dims()[4]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int col_matrix_width = output_width * output_height; int 
im_size = im_height * im_width; size_t copy_size = sizeof(T) * output_width; @@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, constexpr int prw = 1; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int im_size = im_height * im_width; int col_matrix_width = output_width * output_height; int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index a17807e8a997f0ecf908313a4cb205676e4fa4b8..05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -65,7 +65,7 @@ struct TensorSetConstantCPU { : tensor_(tensor), value_(value) {} template void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); + auto* begin = tensor_->template mutable_data(lite::TargetType::kX86); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } lite::Tensor* tensor_; @@ -126,7 +126,7 @@ struct RowwiseAdd { const T* input_data = input.data(); const T* vector_data = vector.data(); - T* output_data = output->mutable_data(); + T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t j = 0; j < size; ++j) { output_data[i * in_dims[0] + j] = diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index 3aaca2e59370f8f2b922554ec6f378bb2a3de9b5..acfb76759f6fc9fa4122afd2388bc3adf8f5ea22 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -83,7 +83,7 @@ class ColwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), size); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -129,7 +129,7 @@ class RowwiseMean { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -173,7 +173,7 @@ class RowwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc index 20b40fe7c5000cc1d0ee80c18efa5d1defc911f0..f97b16f7fb3326a6d2eb186e2984df3dbd0a0a90 100644 --- a/lite/backends/x86/math/maxouting.cc +++ b/lite/backends/x86/math/maxouting.cc @@ -35,7 +35,7 @@ class MaxOutFunctor { // c_size means the output size of each sample int c_size = fea_size * output_channels; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int new_bindex = c_size * i; @@ -72,7 +72,8 @@ class MaxOutGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template 
mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int blen = fea_size * output_channels * i; diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index ab6c1edb481f914d5751149aca2595fee550ca51..4393c42157bb7667ec2218e8b76f05a2c60bcc86 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -54,8 +54,8 @@ class Pool2dFunctor { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; - const T* input_data = input->data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + const T* input_data = input->template data(); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -137,7 +137,8 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -220,7 +221,8 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -322,7 +324,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -425,7 +427,8 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -530,7 +533,8 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h index 5312b3df10a41444c073f0cf61d69bce6fc3859a..4351df68a2630c2b8c6f7285f3955a9b06165f67 100644 --- a/lite/backends/x86/math/sample_prob.h +++ b/lite/backends/x86/math/sample_prob.h @@ -58,11 +58,11 @@ class SampleWithProb { const int64_t* label_data = L->data(); // int64_t* samples_data = // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); + // T* probabilities_data = P->template mutable_data(ret_dim, Target); S->Resize({batch_size, num_sampled_classes}); auto* samples_data = S->mutable_data(Target); P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); + auto* probabilities_data = P->template mutable_data(Target); // temp sets for unique sampling std::unordered_set tmp_samples; diff --git 
a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc index 56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021..014b213d4f10f7161dc1881d582cca93f2be58e5 100644 --- a/lite/backends/x86/math/search_fc.cc +++ b/lite/backends/x86/math/search_fc.cc @@ -42,7 +42,7 @@ class SearchFcFunctor { lite::DDim dims(std::vector({bottom.dims()[0], out_size})); const auto bottom_data = bottom.data(); - auto top_data = top->mutable_data(lite::TargetType::kX86); + auto top_data = top->template mutable_data(lite::TargetType::kX86); const auto weights = w.data(); auto blas = math::GetBlas(context); call_gemm(blas, diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index f8f1b42361832771ba04d1bdc8b3e2e05f954e29..acb377e31ccac96547fc4f0644332cfad36d66bc 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -52,7 +52,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); - auto* out_data = out_value->mutable_data(); + auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); std::copy_n(in1_data, in1_value.numel(), out_data); @@ -87,7 +87,7 @@ struct SelectedRowsAddTensor { functor(context, output, 0.0); auto* in1_data = in1_value.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -127,7 +127,7 @@ struct SelectedRowsAddTo { in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); auto* in1_data = in1_value.data(); - auto* in2_data = in2_value->mutable_data(); + auto* in2_data = in2_value->template mutable_data(); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); } }; @@ -161,7 +161,7 @@ struct SelectedRowsSumTo { input2->set_rows(in2_rows); auto* in2_value = input2->mutable_value(); - T* in2_data = in2_value->mutable_data(); + T* in2_data = in2_value->template mutable_data(); auto blas = math::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { @@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->mutable_data(); + auto* input2_data = input2->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -305,7 +305,7 @@ struct MergeAdd { lite::DDim dims(std::vector( {static_cast(merged_row_set.size()), input_width})); out.mutable_value()->Resize(dims); - auto* out_data = out.mutable_value()->mutable_data(); + auto* out_data = out.mutable_value()->template mutable_data(); if (merged_row_set.size() == row_num && !sorted_result) { // no duplicated ids, just concat the result together @@ -385,7 +385,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* input2_data = input2->template data(); // FIXME(typhoonzero): use macro fix the below messy code. 
switch (op) { diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index c12c05414d717dce706590a491ccae2384f3bfe5..aa7aeac532e2fa1f90d452924b364be1896ee862 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - const size_t* index = index_lod.data(); + const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( @@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor { auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); + auto* dst_data = dst->template mutable_data(); const int sz = width * sizeof(T); if (is_src_index) { for (int i = 0; i < height; ++i) { diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a70cc5bf73522f97ab312fc48553b5316dbf8376..63df008b6dfca936265019a71ac0a553c525dc73 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { // batch_lods[2] is the sort order for the input LoDTensor. batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods->at(0).data(); - size_t* seq2batch_idx = batch_lods->at(1).data(); + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods->at(2).data(); + auto* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index fbb6c11a5f7a0cbae36d2f8fba0b141dadadf542..eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -22,15 +22,15 @@ namespace math { template void CopyValidData(lite::Tensor* dst_tensor, const lite::Tensor* src_tensor, - const std::vector& seq_offsets, + const std::vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; - const T* src_data = src_tensor->data(); - T* dst_data = dst_tensor->mutable_data(); + const T* src_data = src_tensor->template data(); + T* dst_data = dst_tensor->template mutable_data(); int seq_cpy_gap = step_width; int pad_cpy_gap = @@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor { "'step_width'."); // fill padding value - T* pad_data = pad_tensor->mutable_data(); + T* pad_data = pad_tensor->template mutable_data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { fast_mem_init( diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 
a3f4512042de4c7a2fc665f2fd41777d472225f5..43407014dea0ed0c78ab29da7fb8ebb0e0310566 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; enum CopyType { kSeqToPad, kPadToSeq }; -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; +inline static uint64_t MaximumSequenceLength( + const std::vector& seq_offset) { + uint64_t seq_num = seq_offset.size() - 1; + uint64_t max_seq_len = 0; for (size_t i = 0; i < seq_num; ++i) { max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } @@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( inline static void CheckDims(const lite::DDim& seq_tensor_dims, const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, + const std::vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 186b8b5543c7132867093616c83b45ae8ff27d3c..34c55c5714e467954bc1bb79d9b1385ef5cfe497 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int* max_index = index->mutable_data(); int64_t num_seq = out_dims[0]; @@ -103,7 +103,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; @@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { const T* og_data = out_grad.data(); const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); + T* ig_data = in_grad->template mutable_data(); SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); @@ -170,7 +170,7 @@ class LastSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { int64_t in_w = in_grad->numel() / in_grad->dims()[0]; PADDLE_ENFORCE(in_w == out_w); const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); + T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); @@ -288,7 +288,7 @@ class SequencePoolFunctor { auto lod = input.lod()[0]; if (pooltype == "SUM") { const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); + T* dst = output->template mutable_data(TARGET(kX86)); jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), 
jit::SeqPoolType::kSum); diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index a73014767345842f09ac2ff0cd5c2e7231c1f90a..b91f43a571994bef95650361a6dc62c0465837a7 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { TEST(SequencePoolingGrad, CPU_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); @@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc index fad0628de15379b58847827cc3d48bf6085cbda2..25c7be0d0e2747f4f28c1d82f8855872d57726d1 100644 --- a/lite/backends/x86/math/sequence_scale.cc +++ b/lite/backends/x86/math/sequence_scale.cc @@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor { size_t seq_width = seq->dims()[1]; lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(lite::TargetType::kX86); + T* seq_data = seq->template mutable_data(lite::TargetType::kX86); for (size_t i = 0; i < num_seq; ++i) { for (size_t j = lod[level][i] * seq_width; j < lod[level][i + 1] * seq_width; diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc index 035a7923c70f91cf27f1d845f68110f8f33cb73d..97e27fed59f4bc1a4c457ea9cf515da6caca9a1c 100644 --- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor { auto pos_data = pos->mutable_data(lite::TargetType::kX86); int offset = 0; - std::vector vec_out_lod; + std::vector vec_out_lod; vec_out_lod.reserve(batch_size + 1); for (int i = 0; i <= batch_size; ++i) { offset = row_lod[i]; @@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor { out->set_lod(lod_temp); auto in_data = in.data(); - auto out_data = out->mutable_data(lite::TargetType::kX86); + auto out_data = out->template mutable_data(lite::TargetType::kX86); T* sum_data = new T[max_k]; for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ec45377bc55154a4a36ebc5c3684ab7efeeef88e..1ba84dda42093155b10fa74a49e953d6663b8c88 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -108,8 +108,8 @@ class SoftmaxFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T max_val = *std::max_element(in_data, in_data + num_classes); max_val *= static_cast(-1); @@ -219,9 +219,9 @@ class SoftmaxGradFunctor> { const int num_remain = num_classes / axis_dim; if 
(num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T scalar; vec_mul_reduce( diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index 20b913331308c8b8c95d190b6b0b3d76ccac354b..bfc7084c9ff018101ca3dfc1d1748083b1449662 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -104,12 +104,12 @@ class Tree2ColFunctor { patch_size = processing_list.size(); // T *patch_data = - // patch->mutable_data({static_cast(patch_size), + // patch->template mutable_data({static_cast(patch_size), // static_cast(patch_elem_size)}, // cpu_place); patch->Resize({static_cast(patch_size), static_cast(patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); + auto *patch_data = patch->template mutable_data(lite::TargetType::kX86); constant(context, patch, 0); const T *features = node_features.data(); @@ -166,12 +166,12 @@ class Col2TreeFunctor { } } // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), + // in_grad->template mutable_data({static_cast(node_count), // static_cast(grad_elem_size)}, // cpu_place); in_grad->Resize({static_cast(node_count), static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kX86); constant(context, in_grad, 0); const T *out_g = out_grad.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 568f9952cab755c8441695e1a9266a2001d2b9a9..119d7294e9ec21e67f09776ad20d04f15b8b81ce 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -36,7 +36,7 @@ class Unpool2dMaxFunctor { int output_feasize = output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor { int output_feasize = output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 8fd5e8954e2010d5226d56ac4a87a44e6364c8c6..91979bb7fdcfe66d84ded3f9797144ddafc8769e 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -75,7 +75,7 @@ class Vol2ColFunctor { "mismatching."); const T* vol_data = vol.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; @@ -159,7 +159,7 @@ class Col2VolFunctor { output_width, "input_width and output_width are " "mismatching."); - T* vol_data = vol->mutable_data(); + T* vol_data = vol->template mutable_data(); const T* col_data = 
col.data();
 
   for (int c = 0; c < channels_col; ++c) {
diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt
index 4491fdeaefe9f16265bdee2c07ebb02b86a2b038..85bef0452c41ce35c90d9bd058bb7fdefd030f3a 100644
--- a/lite/backends/xpu/CMakeLists.txt
+++ b/lite/backends/xpu/CMakeLists.txt
@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
   return()
 endif()
 
-lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
+if(LITE_WITH_XTCL)
+  lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
+endif()
+lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h
index 6de18d5466da6e6b791363d2e275ea72376c78b8..a2cc3206d3d0391d89690026561f47983e9376c9 100644
--- a/lite/backends/xpu/device.h
+++ b/lite/backends/xpu/device.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
-#include #include #include #include #include #include
+#include "lite/backends/xpu/xpu_header_sitter.h"
 
 namespace paddle {
 namespace lite {
diff --git a/lite/backends/xpu/math.h b/lite/backends/xpu/math.h
new file mode 100644
index 0000000000000000000000000000000000000000..48352736d45a20d9abd496d9dd10b000d3f15a28
--- /dev/null
+++ b/lite/backends/xpu/math.h
@@ -0,0 +1,219 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include +#include +#include +#include
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace xpu {
+namespace math {
+
+static inline long round_half_to_even(const float src) {  // NOLINT
+  long ret = llround(src);  // NOLINT
+  if (fabs(fabs(round(src) - src) - 0.5) > 0) {
+    return ret;
+  } else {
+    if (abs(ret) % 2 == 0) {
+      return ret;
+    } else {
+      return ret + (ret > 0 ? -1 : 1);
+    }
+  }
+}
+
+static float ieee_compliance_0(float f) {
+  uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
+  uint32_t sign = (*ptr) & 0x80000000;
+  uint32_t uf = 0;
+  // nan -> inf
+  if (std::isnan(f)) {
+    uf = (sign | 0x7F800000);
+    float *ptr = reinterpret_cast<float *>(&uf);
+    return *ptr;
+  } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
+    return f;
+  } else {
+    // denormal -> +-0
+    uf = 0x0;
+    float *ptr = reinterpret_cast<float *>(&uf);
+    return *ptr;
+  }
+}
+
+template <typename T, int RMAX>
+static inline T fp32_to_intx(const float f, float max) {
+  max = ieee_compliance_0(max);
+  float input = ieee_compliance_0(f);
+  // +0 and -0 -> +0
+  if (input == 0) {
+    input = 0.0f;
+  }
+
+  float tmp = RMAX / max;
+  if (std::isinf(tmp)) {
+    uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
+    if ((*ptr) >> 31 & 1) {
+      return T(-RMAX);
+    } else {
+      return T(RMAX);
+    }
+  }
+
+  tmp = input * tmp;
+  if (std::isnan(tmp)) {
+    return T(RMAX);
+  }
+
+  tmp = ieee_compliance_0(tmp);
+  // early check to avoid INF or big value get into convertor func.
+ if (tmp > RMAX) { + return T(RMAX); + } + if (tmp < -RMAX) { + return T(-RMAX); + } + T ret = (T)round_half_to_even(tmp); + if (ret > RMAX) { + ret = T(RMAX); + } + if (ret < -RMAX) { + ret = T(-RMAX); + } + return ret; +} + +static inline int16_t fp32_to_int16(const float f, float max) { + int16_t v1 = fp32_to_intx(f, max); + return v1; +} + +static inline int ConvertFP32ToInt16(const void *input, + void *output, + float max_val, + int len) { + for (int i = 0; i < len; i++) { + static_cast(output)[i] = + fp32_to_int16(static_cast(input)[i], max_val); + } + return 0; +} + +static inline float FindMaxAbs(const float *data, int len) { + float max_f = 0.0f; + for (int i = 0; i < len; ++i) { + float max = std::abs(data[i]); + if (max > max_f) { + max_f = max; + } + } + return max_f; +} + +template +static inline void Transpose(const T *in, T *out, int h, int w) { + for (int h1 = 0; h1 < w; ++h1) { + for (int w1 = 0; w1 < h; ++w1) { + out[h1 * h + w1] = in[w1 * w + h1]; + } + } +} + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return lite::DDim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return lite::DDim({y_dim[0], 1}); +} + +/** + * Matrix Descriptor of a memory buffer. + * + * It is used for Blas::MatMul. MatMul operator can be batched. + * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a + * `batch_size` times of GEMM. The batched GEMM could be faster base on the + * implementation of the blas library. The batch size could be zero. If any + * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g., + * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be + * [BatchSize, H1, W2] + * + * The boolean flag, `trans`, describe the memory is the transpose of matrix or + * not. If the trans is true, the last two dims of matrix are transposed. The + * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. + * + * The MatDescriptor is not only the dimension or shape of a matrix, it also + * contains the layout, stride of matrix. It is clearer to have a structure than + * reuse `DDim`. 
+ */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcbc1e275cca8c32003cbef74dfb1f6d4caee93 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { + +void* TargetWrapperXPU::Malloc(size_t size) { + void* ptr{nullptr}; + xpu_malloc(&ptr, size); + return ptr; +} + +void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } + +void TargetWrapperXPU::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + switch (dir) { + case IoDirection::HtoD: + xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + break; + case IoDirection::DtoH: + xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + break; + default: + LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c42d4139246085d8b9a367b45b60699209d0b668 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperXPU = TargetWrapper; + +template <> +class TargetWrapper { + public: + static size_t num_devices() { return 1; } + static size_t maximum_stream() { return 0; } + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/xpu_header_sitter.h b/lite/backends/xpu/xpu_header_sitter.h new file mode 100644 index 0000000000000000000000000000000000000000..875e67d57d4ba2110bfbffb7ee9d1d6a876060fa --- /dev/null +++ b/lite/backends/xpu/xpu_header_sitter.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#pragma GCC system_header +#include +#include +#include + +#if defined(LITE_WITH_XTCL) +#include +#endif + +namespace paddle { +namespace lite { + +namespace xdnn = baidu::xpu::api; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index db8bc29d70d4764f14f24915fcbc254ba2af91df..278f971b0b1ee8a0b941158839fcc6810e25ad67 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -5,9 +5,11 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc DEPS target_wrapper_host place X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda + XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper - BM_DEPS target_wrapper_bm) + BM_DEPS target_wrapper_bm + MLU_DEPS target_wrapper_mlu) lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 0f3f36768bd5a079564002cbb6464d61bd5db3aa..afc104073684ff00395fb32335630705ff3f7bc8 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.cc b/lite/core/context.cc index be886168e02e21d192305d701110ce5075ffba63..be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -15,5 +15,11 @@ #include "lite/core/context.h" namespace paddle { -namespace lite {} // 
namespace lite +namespace lite { + +#ifdef LITE_WITH_XPU +thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +#endif + +} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 23ce3e9d32888a4f7f544f53bda355edc03633ac..d20ccde592ff8190c6dde9ba494a43fc584191b6 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -24,6 +24,14 @@ #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif +#ifdef LITE_WITH_MLU +#include +#include +#include "lite/backends/mlu/mlu_utils.h" +#endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/xpu_header_sitter.h" +#endif #include #include @@ -119,11 +127,38 @@ class Context { public: Context() {} explicit Context(const XPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + static xdnn::Context* GetRawContext() { + if (_tls_raw_ctx == nullptr) { + _tls_raw_ctx = xdnn::create_context(); + CHECK(_tls_raw_ctx); + } + return _tls_raw_ctx; + } + + static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { + xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + } + + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + std::string name() const { return "XPUContext"; } + + private: + static thread_local xdnn::Context* _tls_raw_ctx; }; #endif @@ -188,6 +223,85 @@ class Context { }; #endif +#ifdef LITE_WITH_MLU +template <> +class Context { + public: + typename Env::Devs& devs = Env::Global(); + + void InitOnce() {} + + MLUContext& operator=(const MLUContext& ctx) { + this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + return *this; + } + + void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + SetMluDevice(device_id_); + if (io_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "data queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + io_queue_id = 0; + } + if (exec_queue_id >= devs[dev_id].max_queue()) { + LOG(WARNING) << "exec queue index exceeds the maximum queue number, " + "set to default qeueu(0)!"; + exec_queue_id = 0; + } + io_queue_ = devs[dev_id].io_queues()[io_queue_id]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; + + exec_queue_id_ = exec_queue_id; + io_queue_id_ = io_queue_id; + } + + void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } + + const cnrtQueue_t& exec_queue() const { return exec_queue_; } + void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; } + + const cnrtQueue_t& io_queue() const { return io_queue_; } + void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } + + cnmlCoreVersion_t MLUCoreVersion() { + return DeviceInfo::Global().MLUCoreVersion(); + } + + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + + u32_t affinity() { return affinity_; } + + cnrtInvokeFuncParam_t forward_param() { return forward_param_; } + + int device_id() { return device_id_; } + + std::string name() const { return "MLUContext"; } + + private: + int device_id_; + // overall information + int exec_queue_id_; + int io_queue_id_; + 
cnrtQueue_t io_queue_; + cnrtQueue_t exec_queue_; + + std::vector input_notifiers_; + std::vector output_notifiers_; + + cnrtInvokeFuncParam_t forward_param_; + u32_t affinity_ = 0x01; +}; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA // Only works with CUDA kernels. template <> @@ -197,7 +311,11 @@ class Context { Env::Global(); // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { - cublas_fp32_ = std::make_shared>(); + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; + } } void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { CHECK_GT(devs.size(), 0UL) @@ -417,6 +535,15 @@ class ContextScheduler { .As() .CopySharedTo(&ctx->As()); break; +#ifdef LITE_WITH_MLU + case TARGET(kMLU): { + int dev_id = TargetWrapper::GetCurDevice(); + auto& context = ctx->As(); + context.Init(dev_id); + kernel_contexts_[TargetType::kMLU].As().CopySharedTo( + &context); + LOG(INFO) << "New Context for MLU"; + } break; #endif default: #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) @@ -461,6 +588,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_BM InitContext(); +#endif +#ifdef LITE_WITH_MLU + InitContext(); #endif } diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6e0d743fb9d8d8af5e7168e292c1e85d76844383..29ac96ed744b016833a746b35002dd68109efd8b 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -58,7 +58,7 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) thread_local lite_api::PowerMode DeviceInfo::mode_; thread_local ARMArch DeviceInfo::arch_; thread_local int DeviceInfo::mem_size_; @@ -66,6 +66,15 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector DeviceInfo::mean_vec_; +thread_local std::vector DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + 
+#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); @@ -1159,6 +1207,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) { #endif // LITE_WITH_ARM +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id) { + LOG(INFO) << "Set mlu device " << device_id; + cnrtDev_t dev_handle; + CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id)); + CNRT_CALL(cnrtSetCurrentDevice(dev_handle)); +} + +void Device::Init() { + SetMluDevice(idx_); + GetInfo(); + CreateQueue(); +} + +void Device::GetInfo() {} + +void Device::CreateQueue() { + exec_queue_.clear(); + io_queue_.clear(); + for (size_t i = 0; i < max_queue_; ++i) { + cnrtQueue_t exec_queue; + cnrtQueue_t io_queue; + cnrtCreateQueue(&exec_queue); + cnrtCreateQueue(&io_queue); + exec_queue_.push_back(exec_queue); + io_queue_.push_back(io_queue); + + cnrtCreateQueue(&exec_queue); + exec_queue_.push_back(exec_queue); + } +} +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA void Device::Init() { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 1ff8b896a70dc538d2486a24db2625c7b62c600a..a108ae3d4b564aaac02a63ead9a35eba26a6cf63 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -19,11 +19,14 @@ #include #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/mlu_utils.h" +#endif namespace paddle { namespace lite { -#ifdef LITE_WITH_ARM +#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU)) typedef enum { kAPPLE = 0, @@ -52,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector& MeanVec() const; + const std::vector& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -103,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -134,6 +160,9 @@ class Env { return *devs; } static void Init(int max_stream = 4) { +#ifdef LITE_WITH_MLU + CNRT_CALL(cnrtInit(0)); +#endif Devs& devs = Global(); if (devs.size() > 0) { return; @@ -142,7 +171,7 @@ class Env { // Get device count count = API::num_devices(); if (count == 0) { - CHECK(false) << "No device found!"; + LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!"; } else { LOG(INFO) << "Found " << count << " device(s)"; } @@ -156,6 +185,41 @@ class Env { } }; +#ifdef LITE_WITH_MLU +void SetMluDevice(int device_id); + +template <> +class Device { + public: + Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {} + void Init(); + + int id() { return idx_; } + int max_queue() { return max_queue_; } + void SetId(int idx) { idx_ = idx; } + 
std::string name() { return "MLU"; } + int core_num() { return 16; } + float max_memory() { return 16 * 1024; } + std::vector io_queues() { return io_queue_; } + std::vector exec_queues() { return exec_queue_; } + + private: + void CreateQueue(); + void GetInfo(); + + private: + int idx_{0}; + int max_queue_; + std::string device_name_; + float max_memory_; + + std::vector io_queue_; + std::vector exec_queue_; +}; + +template class Env; +#endif // LITE_WITH_MLU + #ifdef LITE_WITH_CUDA template <> class Device { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 18a1243c11652afc181f13f0f5a497858a30885f..ff848dae9e4ad6e8aaef70432301033406633db6 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,6 +83,9 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif +#if defined(LITE_WITH_MLU) + WorkSpace::Global_MLU().AllocReset(); +#endif #ifdef LITE_WITH_PROFILE profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 0ee973a8b6412a2fd20e33745b7b86561696efae..1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -45,6 +45,16 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + data = TargetWrapper::Malloc(size); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + data = TargetWrapperXPU::Malloc(size); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } @@ -83,6 +93,16 @@ void TargetFree(TargetType target, void* data, std::string free_flag) { TargetWrapper::Free(data); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::Free(data); + break; +#endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::Free(data); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown type"; } @@ -114,6 +134,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync(dst, src, size, IoDirection::DtoD); break; #endif +#ifdef LITE_WITH_MLU + case TargetType::kMLU: + TargetWrapper::MemcpySync( + dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index 691415aecb53bf7f48faf5fbb4dbca448da04a10..a1013910019251271ddfccfbc700297c45226fe6 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -31,6 +31,14 @@ #include "lite/backends/bm/target_wrapper.h" #endif // LITE_WITH_BM +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif // LITE_WITH_MLU + +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif // LITE_WITH_XPU + namespace paddle { namespace lite { @@ -75,6 +83,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapperCL::MemcpySync(dst, src, size, dir); break; #endif // LITE_WITH_OPENCL +#ifdef LITE_WITH_MLU + case TARGET(kMLU): + TargetWrapperMlu::MemcpySync(dst, src, size, dir); + break; +#endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): TargetWrapper::MemcpySync(dst, src, size, dir); @@ -126,7 +139,7 @@ class Buffer { const size_t img_h, void* host_ptr = nullptr) { if (target != target_ || cl_image2d_width_ 
< img_w || - cl_image2d_height_ < img_h) { + cl_image2d_height_ < img_h || host_ptr != nullptr) { CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 82b19b030c35e69ad2a666f93475c556cc51fd23..91accc907ed16b2de64e5982b88d38029fd2902b 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,6 +21,8 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__multi_encoder_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc @@ -35,6 +37,7 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + mlu_postprocess_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) @@ -69,10 +72,10 @@ set(pattern_deps mir_node mir_ssa_graph op) if (WITH_TESTING) list(APPEND pattern_deps gtest) endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) +lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) +lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher) # for mobile, unnecessary to compile the following testings. diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h index df70565c0775acdb61cb540598f15b7f84e0119c..a68890910ab33bd32c68efc6f06236db21909a05 100644 --- a/lite/core/mir/dot.h +++ b/lite/core/mir/dot.h @@ -27,8 +27,8 @@ #include "lite/utils/string.h" namespace paddle { -namespace inference { -namespace analysis { +namespace lite { +namespace mir { static size_t dot_node_counter{0}; @@ -162,6 +162,6 @@ class Dot { std::vector attrs_; }; -} // namespace analysis -} // namespace inference +} // namespace mir +} // namespace lite } // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index e65e72cf7b367ee8477f3f783ae4d82372529864..04a36976c7110c64ef781af12fc86fd4853fe583 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose DEPS pattern_matcher_high_api) lite_cc_library(fuse_interpolate SRCS interpolate_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..655274070f1ffcccf39b5f3ff6aaa705c5cbbfda --- /dev/null +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -0,0 +1,637 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUSingleEncoderFuser : public FuseBase { + public: + explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu") + : act_type_(act_type) {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + + auto* q_mul_y = + VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* q_mul = OpNode("q_mul", "mul"); + auto* q_mul_out = VarNode("q_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* q_add_y = VarNode("q_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate(); + auto* q_add_out = VarNode("q_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate(); + auto* q_reshape2_out = VarNode("q_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate(); + auto* q_transpose2_out = VarNode("q_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X") + ->AsIntermediate(); + auto* q_transpose2_xshape = + VarNode("q_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate(); + auto* q_scale_out = VarNode("q_scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + + auto* k_mul_y = + VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate(); + auto* k_mul_out = VarNode("k_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* k_add_y = VarNode("k_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate(); + auto* k_add_out = VarNode("k_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate(); + auto* k_reshape2_out = VarNode("k_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* k_reshape2_xshape = VarNode("k_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* k_transpose2 = 
OpNode("k_transpose2", "transpose2")->AsIntermediate(); + auto* k_transpose2_out = VarNode("k_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* k_transpose2_xshape = + VarNode("k_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul_out = VarNode("qk_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qk_mask = VarNode("qk_mask") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate(); + auto* qk_add_out = VarNode("qk_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X") + ->AsIntermediate(); + auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate(); + auto* qk_softmax_out = VarNode("qk_softmax_out") + ->assert_is_op_output("softmax", "Out") + ->AsIntermediate(); + auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate(); + auto* qk_dropout_out = VarNode("qk_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* qk_dropout_mask = VarNode("qk_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* v_mul_y = + VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate(); + auto* v_mul_out = VarNode("v_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* v_add_y = VarNode("v_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate(); + auto* v_add_out = VarNode("v_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate(); + auto* v_reshape2_out = VarNode("v_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* v_reshape2_xshape = VarNode("v_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate(); + auto* v_transpose2_out = VarNode("v_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* v_transpose2_xshape = + VarNode("v_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate(); + auto* qkv_matmul_out = VarNode("qkv_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* qkv_transpose2 = + OpNode("qkv_transpose2", "transpose2")->AsIntermediate(); + auto* qkv_transpose2_out = VarNode("qkv_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* qkv_transpose2_xshape = + VarNode("qkv_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* qkv_reshape2 = OpNode("qkv_reshape2", "reshape2")->AsIntermediate(); + auto* qkv_reshape2_out = VarNode("qkv_reshape2_out") + 
->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* qkv_mul_y = + VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate(); + auto* qkv_mul_out = VarNode("qkv_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_y = VarNode("qkv_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate(); + auto* qkv_add_out = VarNode("qkv_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate(); + auto* qkv_dropout_out = VarNode("qkv_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_mask = VarNode("qkv_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate(); + auto* qkv_add_2_out = VarNode("qkv_add_2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate(); + auto* qkv_ln_2_out = VarNode("qkv_ln_2_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_2_var = VarNode("qkv_ln_2_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + auto* qkv_mul_3_y = + VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate(); + auto* qkv_mul_3_out = VarNode("qkv_mul_3_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_3_y = VarNode("qkv_add_3_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate(); + auto* qkv_add_3_out = VarNode("qkv_add_3_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate(); + auto* qkv_act_out = VarNode("qkv_act_out") + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_mul_4_y = + VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate(); + auto* qkv_mul_4_out = VarNode("qkv_mul_4_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_4_y = VarNode("qkv_add_4_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate(); + 
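+  // NOTE: every VarNode()/OpNode() call in this method declares one node of the
+  // pattern; the assert_is_op_input/assert_is_op_output predicates pin down how
+  // that node may be wired, and AsInput()/AsOutput()/AsIntermediate() decide
+  // which nodes survive the fusion (intermediates are removed once the fused op
+  // is inserted). The operator>> chains at the end of BuildPattern() add the
+  // edges. The subgraph being matched is one standard transformer encoder layer:
+  //   Q/K/V : mul -> elementwise_add -> reshape2 -> transpose2 (Q also scale)
+  //   attn  : matmul(Q,K) -> add mask -> softmax -> dropout -> matmul(.,V)
+  //   out   : transpose2 -> reshape2 -> mul -> add -> dropout -> residual add
+  //           -> layer_norm -> FFN (mul -> add -> act -> mul -> add -> dropout)
+  //           -> residual add -> layer_norm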
auto* qkv_add_4_out = VarNode("qkv_add_4_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate(); + auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate(); + auto* qkv_add_5_out = VarNode("qkv_add_5_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate(); + auto* qkv_ln_5_out = VarNode("qkv_ln_5_out") + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_5_var = VarNode("qkv_ln_5_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + // TODO(miaotianxiang): use LinksFrom/LinksTo() instead + *input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >> + *q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >> + *q_scale_out >> *qk_matmul; + *q_mul_y >> *q_mul; + *q_add_y >> *q_add; + *q_reshape2 >> *q_reshape2_xshape; + *q_transpose2 >> *q_transpose2_xshape; + + *input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >> + *k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul; + *k_mul_y >> *k_mul; + *k_add_y >> *k_add; + *k_reshape2 >> *k_reshape2_xshape; + *k_transpose2 >> *k_transpose2_xshape; + + *qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >> + *qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul; + *qk_mask >> *qk_add; + *qk_dropout >> *qk_dropout_mask; + + *input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >> + *v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul; + *v_mul_y >> *v_mul; + *v_add_y >> *v_add; + *v_reshape2 >> *v_reshape2_xshape; + *v_transpose2 >> *v_transpose2_xshape; + + *qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >> + *qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >> + *qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >> + *qkv_add_2; + *qkv_transpose2 >> *qkv_transpose2_xshape; + *qkv_reshape2 >> *qkv_reshape2_xshape; + *qkv_mul_y >> *qkv_mul; + *qkv_add_y >> *qkv_add; + *qkv_dropout >> *qkv_dropout_mask; + + *input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out; + *qkv_ln_2_scale >> *qkv_ln_2; + *qkv_ln_2_bias >> *qkv_ln_2; + *qkv_ln_2 >> *qkv_ln_2_mean; + *qkv_ln_2 >> *qkv_ln_2_var; + + *qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >> + *qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >> + *qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >> + *qkv_dropout_4_out >> *qkv_add_5; + *qkv_mul_3_y >> *qkv_mul_3; + *qkv_add_3_y >> *qkv_add_3; + *qkv_mul_4_y >> *qkv_mul_4; + *qkv_add_4_y >> *qkv_add_4; + *qkv_dropout_4 >> *qkv_dropout_4_mask; + + *qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out 
>> *qkv_ln_5 >> *qkv_ln_5_out; + *qkv_ln_5_scale >> *qkv_ln_5; + *qkv_ln_5_bias >> *qkv_ln_5; + *qkv_ln_5 >> *qkv_ln_5_mean; + *qkv_ln_5 >> *qkv_ln_5_var; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("single_encoder"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name}); + op_desc.SetInput("FCWeight", + { + matched.at("q_mul_y")->arg()->name, + matched.at("k_mul_y")->arg()->name, + matched.at("v_mul_y")->arg()->name, + matched.at("qkv_mul_y")->arg()->name, + matched.at("qkv_mul_3_y")->arg()->name, + matched.at("qkv_mul_4_y")->arg()->name, + }); + op_desc.SetInput("FCBias", + { + matched.at("q_add_y")->arg()->name, + matched.at("k_add_y")->arg()->name, + matched.at("v_add_y")->arg()->name, + matched.at("qkv_add_y")->arg()->name, + matched.at("qkv_add_3_y")->arg()->name, + matched.at("qkv_add_4_y")->arg()->name, + }); + op_desc.SetInput("LNScale", + { + matched.at("qkv_ln_2_scale")->arg()->name, + matched.at("qkv_ln_5_scale")->arg()->name, + }); + op_desc.SetInput("LNBias", + { + matched.at("qkv_ln_2_bias")->arg()->name, + matched.at("qkv_ln_5_bias")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); + auto reshape_dim = reshape_op_info->GetAttr>("shape"); + op_desc.SetAttr("head_num", reshape_dim[2]); + op_desc.SetAttr("size_per_head", reshape_dim[3]); + op_desc.SetAttr("act_type", act_type_); + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto* single_encoder_stmt = matched.at("q_mul")->stmt(); + fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); + single_encoder_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "qk_mask", + "k_mul_y", + "v_mul_y", + "qkv_mul_y", + "qkv_mul_3_y", + "qkv_mul_4_y", + "q_add_y", + "k_add_y", + "v_add_y", + "qkv_add_y", + "qkv_add_3_y", + "qkv_add_4_y", + "qkv_ln_2_scale", + "qkv_ln_2_bias", + "qkv_ln_5_scale", + "qkv_ln_5_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul")); + } + IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out")); + } + + private: + std::string act_type_; +}; + +class XPUMultiEncoderFuser { + public: + bool IsDirectPredecessorOf(Node* op1, Node* op2) { + for (auto* out : op1->outlinks) { + for (auto* in : op2->inlinks) { + if (out == in) return true; + } + } + return false; + } + + void operator()(SSAGraph* graph) { + std::vector all_encoders; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "single_encoder") { + all_encoders.push_back(node); + } + } + VLOG(3) << "Found " << all_encoders.size() << " single_encoder"; + if (all_encoders.size() == 0) { + return; + } + + // TODO(miaotianxiang): more verification + for (size_t i = 0; i < all_encoders.size() - 1; ++i) { + CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1])); + } + std::string mask_name; + for (auto* encoder : all_encoders) { + auto* op_info = encoder->stmt()->op_info(); + if (mask_name.empty()) { + mask_name = op_info->Input("Mask").front(); + } else { + // CHECK(mask_name == op_info->Input("Mask").front()); + } + } + + std::unordered_set to_remove; + Node* first_encoder = all_encoders[0]; + std::string in_name, out_name; + std::vector arg_names{ + "FCWeight", "FCBias", "LNScale", "LNBias"}; + std::unordered_map> arg_map; + for (size_t i = 0; i < all_encoders.size(); ++i) { + Node* cur_encoder = all_encoders[i]; + auto* op_info = cur_encoder->stmt()->op_info(); + for (auto arg_name : arg_names) { + auto real_names = op_info->Input(arg_name); + for (auto name : real_names) { + auto* arg_node = graph->RetrieveArgument(name); + DirectedLink(arg_node, first_encoder); + arg_map[arg_name].push_back(name); + } + } + + auto* cur_out = + graph->RetrieveArgument(op_info->Output("Outputs").front()); + if (i == 0) { + // first encoder + to_remove.insert(cur_out); + in_name = op_info->Input("Inputs").front(); + mask_name = op_info->Input("Mask").front(); + } else if (i == all_encoders.size() - 1) { + // last encoder + to_remove.insert(cur_encoder); + DirectedLink(first_encoder, cur_out); + out_name = op_info->Output("Outputs").front(); + } else { + to_remove.insert(cur_encoder); + to_remove.insert(cur_out); + } + } + GraphSafeRemoveNodes(graph, to_remove); + + auto* multi_encoder_stmt = first_encoder->stmt(); + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__multi_encoder"); + op_desc.SetInput("Input", {in_name}); + for (auto kv : arg_map) { + op_desc.SetInput(kv.first, kv.second); + } + op_desc.SetInput("Mask", {mask_name}); + op_desc.SetOutput("Output", {out_name}); + op_desc.SetAttr("xpu", 1); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + op_desc.SetAttr("head_num", + first_encoder_op_info->GetAttr("head_num")); + 
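+    // NOTE: head_num and size_per_head were distilled by XPUSingleEncoderFuser
+    // from the "shape" attribute of the matched q_reshape2 op (shape[2] and
+    // shape[3]); e.g. a BERT-base style reshape target of {0, 0, 12, 64} gives
+    // head_num = 12 and size_per_head = 64. The fused __xpu__multi_encoder
+    // simply inherits them from the first single_encoder, along with act_type,
+    // and adds n_layers = all_encoders.size().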
op_desc.SetAttr("size_per_head", + first_encoder_op_info->GetAttr("size_per_head")); + op_desc.SetAttr("n_layers", all_encoders.size()); + op_desc.SetAttr( + "act_type", first_encoder_op_info->GetAttr("act_type")); + + auto* scope = multi_encoder_stmt->op()->scope(); + std::vector fc_weight_max(arg_map["FCWeight"].size()); + auto& fc_weight_names = arg_map["FCWeight"]; + for (size_t i = 0; i < fc_weight_names.size(); ++i) { + auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + fc_weight_max[i] = max_f; + } + + std::string max_name = "encoder_max"; + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, first_encoder); + auto* max_filter_tensor = scope->NewTensor(max_name); + max_filter_tensor->Resize({static_cast(fc_weight_max.size())}); + memcpy(max_filter_tensor->mutable_data(), + &fc_weight_max[0], + sizeof(float) * fc_weight_max.size()); + op_desc.SetInput("FCWeightMax", {max_name}); + + auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type()); + multi_encoder_op->Attach(op_desc, scope); + multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places()); + auto kernels = + multi_encoder_op->CreateKernels(multi_encoder_op->valid_places()); + multi_encoder_stmt->SetOp(multi_encoder_op); + multi_encoder_stmt->SetKernels(std::move(kernels)); + + // temp remove useless cast + std::unordered_set to_remove2; + Node* stack = nullptr; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "stack") { + stack = node; + } + } + Node* stack_out = stack->outlinks.front(); + for (Node* cast : stack_out->outlinks) { + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // remove + to_remove2.insert(cast_out); + to_remove2.insert(cast); + } + } + GraphSafeRemoveNodes(graph, to_remove2); + } +}; + +} // namespace fusion + +class XPUMultiEncoderFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + // TODO(miaotianxiang): backup graph, recover from failed match + std::vector act_types{"gelu", "relu"}; + for (auto& act_type : act_types) { + fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type); + single_encoder_fuser(graph.get()); + fusion::XPUMultiEncoderFuser multi_encoder_fuser; + multi_encoder_fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass, + paddle::lite::mir::XPUMultiEncoderFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("matmul"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc 
new file mode 100644 index 0000000000000000000000000000000000000000..de2210a76ea0647cb02131a088ceb754afd0ef9c --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -0,0 +1,951 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetBlock0Fuser : public FuseBase { + public: + XPUResNetBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + 
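+    // NOTE: this fuser captures the projection-shortcut bottleneck of ResNet50:
+    //   main branch    : conv1 -> bn1 -> relu -> conv2 -> bn2 -> relu -> conv3 -> bn3
+    //   shortcut branch: conv -> bn   (the 1x1 projection in ResNet50)
+    // and both branches meet in elementwise_add -> relu. XPUResNetBlock1Fuser
+    // below matches the identity-shortcut variant, where the block input feeds
+    // the elementwise_add directly and only one branch carries the three
+    // conv+bn stages.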
auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + 
->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + 
matched.at("left_bn3_scale")->arg()->name, + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetBlock1Fuser : public FuseBase { + public: + XPUResNetBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* 
right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + 
VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNet50Fuser : public xpu::XPUFuseBase { + public: + XPUResNet50Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", 
"resnet_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate(); + auto* bottom_pool_out = VarNode("bottom_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> 
*resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet50"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + op_desc.SetInput("Filter", filter_name); + op_desc.SetInput("Bias", bias_name); + op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + + auto* resnet50_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet50_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = 
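The weight preprocessing in InsertNewNode folds each batch_norm into its preceding convolution before quantizing: per output channel i, with the 1e-5 epsilon used below, alpha_i = scale_i / sqrt(var_i + 1e-5), every filter value of channel i is multiplied by alpha_i, and bias_i (initialized from the BN Bias input) gets -mean_i * alpha_i added. A standalone sketch of the same arithmetic:

#include <cmath>

// Hedged reference for the folding loops below; `filter` holds
// channels * stride values laid out channel-major.
void FoldBatchNormIntoConv(float* filter, const float* scale, float* bias,
                           const float* mean, const float* var,
                           int channels, int stride) {
  for (int i = 0; i < channels; ++i) {
    float alpha = scale[i] / std::sqrt(var[i] + 1e-5f);
    for (int j = 0; j < stride; ++j) filter[i * stride + j] *= alpha;
    bias[i] += -mean[i] * alpha;
  }
}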
filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("MaxFilter", max_filter_name); + + auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet50_op->Attach(op_desc, scope); + resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places()); + auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places()); + resnet50_stmt->SetOp(resnet50_op); + resnet50_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out")); + } +}; + +} // namespace fusion + +class XPUResNet50FusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNet50Fuser resnet50_fuser; + resnet50_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_fuse_pass, + paddle::lite::mir::XPUResNet50FusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 150a6e68d8a924ebfa96fdffb99e28b230689a48..43869beddd0af701d5f78ea047b30f6b136e6b3f 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? 
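After folding, the filter is quantized symmetrically to int16 against its max absolute value (FindMaxAbs / ConvertFP32ToInt16 above), and that max is replicated into a small "<filter_name>_max" weight consumed by the XPU kernel. The library helpers may differ in rounding details, but the conversion amounts to:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative symmetric FP32 -> int16 quantization against max |x|.
std::vector<int16_t> QuantizeToInt16(const float* data, int len,
                                     float max_abs) {
  std::vector<int16_t> out(len);
  const float scale = max_abs > 0.f ? 32767.f / max_abs : 0.f;
  for (int i = 0; i < len; ++i) {
    float q = std::round(data[i] * scale);
    out[i] = static_cast<int16_t>(std::max(-32768.f, std::min(32767.f, q)));
  }
  return out;
}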
true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 28ec814fa85451b5292bfde6bddc6b64b57b2f08..a32c9c05f69e5c31b77bc0d2ff976560f29b9bec 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -26,15 +26,13 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - void GraphVisualizePass::Apply(const std::unique_ptr& graph) { VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { std::ostringstream os; - inference::analysis::Dot dot; + Dot dot; auto string_trunc = [](const std::string& str) -> std::string { const int max_disp_size = 100; if (str.length() > max_disp_size) diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index d6240888d0806486f478511ef81ba8179b46ab43..15f62f36b0f026dc42ecbb274c946e294c7fc44e 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -15,7 +15,6 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include #include -#include #include #include #include @@ -50,10 +49,9 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, op_desc.SetAttr("out_dtype", 4); // FP16 op_desc.SetInput("X", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NCHW -> NHWC - op_desc.SetAttr>("axis", {0, 2, 3, 1}); - op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetInput("Input", {cur_node->AsArg().name}); op_desc.SetOutput("Out", {cast_arg_name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cur_node->AsArg().name}); @@ -72,8 +70,15 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } - } else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*out_arg_ty, *cast_type) && + // for first conv + PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -89,8 +94,13 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -113,7 +123,7 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_arg->AsArg().type = cast_type; auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type - var->GetMutable<::paddle::lite::Tensor>(); + var->GetMutable(); // create the stmt node auto* 
cast_inst = graph->NewInstructNode(); @@ -127,10 +137,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, op_desc.SetAttr("out_dtype", 5); // FP16 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); - } else if (op_type == "transpose") { + } else if (op_type == "layout") { // NHWC -> NCHW - op_desc.SetAttr>("axis", {0, 3, 1, 2}); - op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetInput("Input", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "io_copy") { op_desc.SetInput("Input", {cast_arg_name}); @@ -151,8 +160,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } - } else if (op_type == "transpose") { - is_found = true; + } else if (op_type == "layout") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (DataLayoutCompatible(*in_arg_ty, *cast_type) && + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); @@ -168,8 +182,13 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + if (op_type == "layout") { + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(TARGET(kX86))); + } else { + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target())); + } break; } } @@ -193,24 +212,28 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, auto* cur_node = head_node; const auto name_prefix = head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + bool is_first_conv_head = + std::find(first_conv_nodes_.begin(), + first_conv_nodes_.end(), + head_node->AsArg().name) != first_conv_nodes_.end(); - // layout cast node - if (head_type->layout() != inst_type->layout()) { + // precision cast node + if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { cur_node = InsertCastBefore( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), head_type->precision(), inst_type->layout())); + head_type->target(), inst_type->precision(), head_type->layout())); } - // precision cast node - if (head_type->precision() != inst_type->precision()) { + // layout cast node + if (head_type->layout() != inst_type->layout()) { cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -260,7 +283,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, // get subgraph's valid precision const auto& places = graph->valid_places(); - std::set<::paddle::lite_api::PrecisionType> prec_set; + std::set prec_set; for (const auto& place : places) { if (place.target == TARGET(kMLU)) { prec_set.insert(place.precision); @@ -343,23 +366,23 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, const auto name_prefix = tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; - // layout cast node - if (tail_type->layout() != 
inst_type->layout()) { + // precision cast node + if (tail_type->precision() != inst_type->precision()) { cur_node = InsertCastAfter( - "transpose", - name_prefix + "transpose", + "cast", + name_prefix + "cast", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), tail_type->precision(), inst_type->layout())); + tail_type->target(), inst_type->precision(), tail_type->layout())); } - // precision cast node - if (tail_type->precision() != inst_type->precision()) { + // layout cast node + if (tail_type->layout() != inst_type->layout()) { cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "layout", + name_prefix + "layout", graph, cur_node, inst_node, @@ -392,6 +415,14 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, auto* sub_block_op_desc = sub_block_desc->GetOp(i); UpdateOutputTo( sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + /* graph like this + * subgraph_op_0 + * / \ + * / \ + * subgraph_op_1 host_op + */ + UpdateInputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -415,6 +446,56 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { + auto* block_desc = + static_cast(inst->AsStmt().op().get()) + ->GetSubBlock(); + for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + if (op_desc->Type() == "conv2d") { + for (auto& names : op_desc->inputs()) { + if (std::find(names.second.begin(), + names.second.end(), + arg_node->AsArg().name) != names.second.end()) { + return true; + } + } + } + } + return false; +} + +bool MLUPostprocessPass::IsFirstConvNode(Node* arg_node) { + CHECK(arg_node->IsArg()); + for (auto& inst : arg_node->outlinks) { + if (inst->AsStmt().op_type() == "subgraph") { + return IsFirstConvInSubgraph(arg_node, inst); + } + } + return false; +} + +void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + if (IsFirstConvNode(out)) { + first_conv_nodes_.insert(out->AsArg().name); + // modify first conv nodes' type + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + paddle::lite_api::PrecisionType::kInt8, + old_type->layout(), + old_type->device()); + } + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -432,7 +513,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { out->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } @@ -451,7 +532,7 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { inp->AsArg().type = LiteType::GetTensorTy(old_type->target(), old_type->precision(), - ::paddle::lite_api::DataLayoutType::kNHWC, + paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); } } @@ -460,14 +541,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { - // currently for non-persistent input and output args, mlu subgraph op - // only support float16/float32 data type - - // in two situations as folllows: - // 1: 
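Taken together, InsertBefore/InsertAfter now bridge host tensors and the MLU subgraph with up to three single-purpose ops instead of a transpose: a precision "cast" (skipped for the int8 first-conv input), a "layout" op for NCHW<->NHWC whose kernel is pinned to the X86 context, and an "io_copy" for the device transfer. The decision order, sketched below; the io_copy condition is assumed and node plumbing is elided:

// Illustrative ordering only.
void BridgeHeadToSubgraph(const Type* head, const Type* inst,
                          bool is_first_conv_head) {
  if (head->precision() != inst->precision() && !is_first_conv_head) {
    // InsertCastBefore("cast", ...)    -- precision first
  }
  if (head->layout() != inst->layout()) {
    // InsertCastBefore("layout", ...)  -- then NCHW <-> NHWC
  }
  if (head->target() != inst->target()) {
    // InsertCastBefore("io_copy", ...) -- finally the host <-> MLU copy
  }
}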
feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; - // arg_in and arg_out are assumed to be NHWC which user should be aware of. - // Thus here we change these args' layout to NHWC - ModifyLayout(graph.get()); +// currently for non-persistent input and output args, mlu subgraph op +// only support float16/float32 data type + +// in two situations as folllows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of. +// Thus here we change these args' layout to NHWC +#ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyLayout(graph.get()); + } + + if (lite::DeviceInfo::Global().UseFirstConv()) { + GatherAndModifyFirstConvNodes(graph.get()); + } +#endif // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 8ffcbc952a44abea272bdd22467d86cd04baa207..688dd06fb5fbec0c8e1c53acfe4215456ddb4192 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include "lite/core/mir/pass.h" @@ -107,6 +108,15 @@ class MLUPostprocessPass : public ProgramPass { const Type* cast_type); void RecreateOp(Node* inst_node, SSAGraph* graph); + + void GatherAndModifyFirstConvNodes(SSAGraph* graph); + + bool IsFirstConvNode(Node* arg_node); + + bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + + private: + std::set first_conv_nodes_; }; } // namespace mir diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index b625919cbfb6d26ecbbd1bad36772aff86bee087..aaebf852b2ec519515e59655a57600f59ec6a2c3 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector *subgraphs) { } std::string PMPattern::DotString() const { - using inference::analysis::Dot; Dot dot; int id = 0; // Create Nodes diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h index e62a4fc7494d750b2b5331c4b54b787df239ceee..3ac8e331aacb28044fca7f328319de37b27950bf 100644 --- a/lite/core/mir/pattern_matcher_high_api.h +++ b/lite/core/mir/pattern_matcher_high_api.h @@ -64,7 +64,6 @@ class FuseBase { protected: virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; - private: void PerformPatternMatcher(SSAGraph* graph); // Delete nodes that are marked as Intermediate diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 97c4819eaf6734ba9b374444166d17cb15e8ae65..3cbe602f31a87c6ddb42d36fe75e52e8347695d8 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -24,11 +24,31 @@ class RuntimeContextAssignPass : public StmtPass { RuntimeContextAssignPass() {} void Apply(const std::unique_ptr& graph) override { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); + +#ifdef LITE_WITH_OPENCL + if (inst.picked_kernel().target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx) + .As() + .CopySharedTo(&ctx->As()); + 
inst.picked_kernel().SetContext(std::move(ctx)); + } else { + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target())); + } +#else inst.picked_kernel().SetContext( ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + +#endif } } }; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 6c45ce828249c3e236706c297db3d434c71c351a..54f5f4d46ce465d9db78b43f339296a3135c9507 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -64,6 +64,26 @@ std::map> SSAGraph::BuildOperationAdjList() { return adj_list; } +std::map> SSAGraph::BuildNodeAdjList() { + std::map> adj_list; + + for (auto &n : mutable_nodes()) { + if (adj_list.find(&n) == adj_list.end()) { + adj_list[&n] = std::set(); + } + std::vector nodes; + for (auto &var : n.inlinks) { + nodes.push_back(var); + } + std::sort(nodes.begin(), + nodes.end(), + [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); + adj_list[&n].insert(std::make_move_iterator(nodes.begin()), + std::make_move_iterator(nodes.end())); + } + return adj_list; +} + void SSAGraph::SortHelper( const std::map> &adj_list, mir::Node *node, @@ -98,6 +118,24 @@ std::vector SSAGraph::StmtTopologicalOrder() { return res; } +std::vector SSAGraph::NodeTopologicalOrder() { + CheckBidirectionalConnection(); + + std::stack stack; + std::set visited; + std::vector res; + + auto adj_list = BuildNodeAdjList(); + + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &res); + } + } + + return res; +} + Node *SSAGraph::GraphCreateInstructNode( const std::shared_ptr &op, const std::vector &valid_places) { node_storage_.emplace_back(); @@ -213,9 +251,10 @@ std::vector SSAGraph::outputs() { } mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; + for (auto &node : node_storage_) { + if (node.IsArg() && node.arg()->name == arg) { + return &node; + } } return nullptr; } diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index b5b9fb1cb28a35f37d51e4e63eb7512354d0547b..e2967cf96a6b00ccc225ce05b043cb94f161b1d6 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -42,6 +42,8 @@ class SSAGraph : GraphBase { std::vector StmtTopologicalOrder(); + std::vector NodeTopologicalOrder(); + // The inputs of the graph. std::vector inputs(); @@ -86,6 +88,9 @@ class SSAGraph : GraphBase { // Build operator inlink edge table. std::map> BuildOperationAdjList(); + // Build node inlink edge table. 
+ std::map> BuildNodeAdjList(); + void SortHelper(const std::map> &adj_list, mir::Node *node, std::set *visited, diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 6844fd96688d5086b47d66a32f770a757f56fda4..b61f7f365f51a32e267dd12943be5fcfadb3e08a 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -30,10 +30,8 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - std::string SubgraphVisualizer::operator()() { - inference::analysis::Dot dot; + Dot dot; const std::vector subgraph_colors{ "red", "green", "cyan", "bisque3", "coral", "darkseagreen1", "goldenrod1", "darkorchid", @@ -314,8 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { std::vector> SubgraphDetector::ExtractSubgraphs( node_map_t *nodes) { - for (auto &it : *nodes) { - node_dat_t *node = it.second; + for (auto &ordered_node : graph_->NodeTopologicalOrder()) { + // different orders when traversing nodes in graph may lead to + // different subgraph division, which may generate different result + // with device such as MLU. These different results are all "right" + // but a little confusing. Thus the topological order is used instead + // of the address of the node in graph. + CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } @@ -573,13 +577,14 @@ void ExtractInputsOutputs(const std::vector &op_nodes, unused_var_nodes->insert(var_node); continue; } - // Var can have more than one next op node, So, if any one in the - // op_nodes then continue - bool next_op_in_nodes = false; + // Var can have more than one next op node, So, if all next nodes are in + // op_nodes then it should be put into local_var_nodes + bool next_op_in_nodes = true; for (auto &next_op_node : var_node->outlinks) { - if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != + if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) == op_nodes.end()) { - next_op_in_nodes = true; + next_op_in_nodes = false; + break; } } if (next_op_in_nodes) { diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 974772a9839c1e089359be3ae98e1833645ccd7a..1e54e1497b5d49754a705340aafa30ded1c2a727 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 5e2cecd277820ab39b5a25db6159591157982d01..eecd9348ae684929d3f55dee2a94921a078f148c 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -20,6 +20,7 @@ #include #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/subgraph/subgraph_detector.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { } void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { + if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" @@ -67,6 +69,20 @@ void 
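The ExtractInputsOutputs change above tightens the classification rule: a variable becomes a subgraph-local var only when every consumer lies inside the candidate op set; if any consumer is outside, it must remain a subgraph output. The fixed loop is equivalent to an all-of check:

#include <algorithm>
#include <vector>

// Equivalent predicate (element type of op_nodes assumed to be Node*).
bool AllConsumersInside(const std::vector<Node*>& op_nodes, Node* var_node) {
  return std::all_of(
      var_node->outlinks.begin(), var_node->outlinks.end(),
      [&](Node* next_op) {
        return std::find(op_nodes.begin(), op_nodes.end(), next_op) !=
               op_nodes.end();
      });
}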
BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/mlu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -77,3 +93,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 1ba0f2ab4aa52c384f4175de0eb34475b34fb94c..f83448df42ffe6d6d8c5b37503b5127290037dce 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class MLUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 7117e1b3399fe823194f7f1a4d4c239099580955..a2369adc5d882310503cbf52fa5394098d824b40 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif auto tar_predictor = TestModel(FLAGS_model_dir, diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc deleted file mode 100644 index 3a2c94d23298fcb607de0bf821d0dc92c95da7bb..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph_cast_display_pass.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
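Note the runtime switch this introduces: the XTCL subgraph path and the hand-written __xpu__ fusion passes are now mutually exclusive, both keyed off the XPU_ENABLE_XTCL environment variable read through GetBoolFromEnv (lite/utils/env.h). Roughly, with a hypothetical wrapper for illustration:

// Sketch of the gating, not a new API.
void ApplyXpuGraphRewrites(SSAGraph* graph) {
  if (GetBoolFromEnv("XPU_ENABLE_XTCL")) {
    // XPUSubgraphPass: partition out the ops supported by the XTCL bridges.
  } else {
    // __xpu__resnet_fuse_pass / __xpu__multi_encoder_fuse_pass:
    // hand-written fusers targeting the native XPU kernels.
  }
}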
- -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class SubgraphCastDisplayPass : public DebugPass { - public: - void Apply(const std::unique_ptr& graph) override { - VLOG(3) << "== Argument types =="; - for (auto& node : graph->mutable_nodes()) { - if (!node.IsArg()) continue; - - auto* type = node.AsArg().type; - if (type) { - VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; - } else { - VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; - } - } - VLOG(3) << "---------------------"; - - // - VLOG(0) << "== SubgraphOp Debug Info =="; - for (auto& node : graph->mutable_nodes()) { - if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { - VLOG(0) << "FOUND SUBGRAPH OP"; - display_debug_info(node, "subgraph"); - break; - } - } - VLOG(0) << "---------------------"; - } - - void display_debug_info(const Node& node, - std::string op_type, - bool display_in_nodes = true, - bool display_out_nodes = true) { - CHECK(node.IsStmt()); - VLOG(0) << node.AsStmt(); - if (display_in_nodes) { - for (auto p_in_arg_node : node.inlinks) { - CHECK(p_in_arg_node->IsArg()); - VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name - << " type: " << *p_in_arg_node->AsArg().type - << " is_weight: " << p_in_arg_node->AsArg().is_weight - << " is_persist: " << p_in_arg_node->AsArg().is_persist - << " input_count: " << p_in_arg_node->inlinks.size(); - if (p_in_arg_node->inlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_in_stmt_node : p_in_arg_node->inlinks) { - CHECK(p_in_stmt_node->IsStmt()); - std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - if (display_out_nodes) { - for (auto p_out_arg_node : node.outlinks) { - CHECK(p_out_arg_node->IsArg()); - VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name - << " type: " << *p_out_arg_node->AsArg().type - << " is_weight: " << p_out_arg_node->AsArg().is_weight - << " is_persist: " << p_out_arg_node->AsArg().is_persist - << " output_count: " << p_out_arg_node->outlinks.size(); - if (p_out_arg_node->outlinks.size() == 0) { - VLOG(0) << "** END with No Op"; - } - for (auto p_out_stmt_node : p_out_arg_node->outlinks) { - CHECK(p_out_stmt_node->IsStmt()); - std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); - if (stmt_op_type == "cast" || stmt_op_type == "transpose" || - stmt_op_type == "io_copy") { - display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); - } else { - VLOG(0) << "** END with op type: " << stmt_op_type; - } - } - } - } - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(subgraph_cast_display_pass, - paddle::lite::mir::SubgraphCastDisplayPass) - .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 75d8022d5f5f9d8572a5e020c11ae5d8cf630c10..aca7343c8af39f767c2a336e0b298995731b755f 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "picked, opencl found"; is_found = true; } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { + TargetCompatibleTo(*out_arg_ty, to)) { VLOG(4) << 
"picked"; is_found = true; } diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older model, + // save this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the + * list in the quantized ops. +*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. 
*/ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/mir/xpu_pattern_matcher.cc b/lite/core/mir/xpu_pattern_matcher.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f268e7af8a55d22163d52c7f8824406f58bd17b --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "lite/core/mir/dot.h" +#include "lite/core/mir/xpu_pattern_matcher.h" +#include "lite/core/op_lite.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUPatternMatcher::operator()(SSAGraph *graph, + XPUPatternMatcher::handle_t handler) { + if (!MarkPMNodesInGraph(graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detected " << subgraphs.size() << " subgraph"; + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { + VLOG(3) << "mark pmnodes in graph"; + if (graph->nodes().empty()) return false; + for (auto &node : graph->mutable_nodes()) { + for (const auto &pmnode : pattern_.nodes()) { + if (pmnode->Tell(&node)) { + pmnodes2nodes_[pmnode.get()].insert(&node); + } + } + } + // Check to early stop if some PMNode can't find matched Node. + for (auto &pmnode : pattern_.nodes()) { + if (!pmnodes2nodes_.count(pmnode.get())) { + VLOG(4) << pmnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pmnodes2nodes_.size() << " nodes marked"; + + return !pmnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void XPUPatternMatcher::ValidateByNodeRole( + std::vector *subgraphs) { + subgraphs->erase( + std::remove_if(subgraphs->begin(), + subgraphs->end(), + [](const XPUPatternMatcher::subgraph_t &subgraph) -> bool { + // Collect the inlinks and outlinks. 
+ std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + for (auto &item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto *x : item.second->outlinks) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); + + for (auto &subgraph : *subgraphs) { + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + extra_input_vars_.emplace_back(); + for (auto &item : subgraph) { + for (auto *x : item.second->inlinks) { + if (x->IsArg() && ios.count(x) == 0) { + // extra weight var + extra_input_vars_.back().push_back(x); + } + } + } + } +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node *node, PMNode *pat) { + if (nodes_.count(node)) { + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; + } + } + + void Register(Node *node, PMNode *pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outlinks) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector XPUPatternMatcher::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pmnodes2nodes_.count(first_pnode)) return result; + for (auto *node : pmnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PMNode to subgraphs by deducing the connection relations defined + // in edges of PMNodes. + for (const auto &edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PMNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node *source : pmnodes2nodes_[edge.first]) { + for (Node *target : pmnodes2nodes_[edge.second]) { + // TODO(Superjomn) add some prune strategies. 
+ for (const auto &group : pre_groups) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + } + + for (auto &group : bi_records[step % 2]) { + XPUPatternMatcher::subgraph_t subgraph; + for (auto &role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +struct GraphItemLessThan { + bool operator()(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } + } +}; + +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 +void XPUPatternMatcher::UniquePatterns( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + std::hash hasher; + for (auto &g : *subgraphs) { + // Sort the items in the sub-graph, and transform to a string key. + std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); + STL::stringstream ss; + for (auto &item : sorted_keys) { + ss << reinterpret_cast(item.first) << ":" + << reinterpret_cast(item.second); + } + auto key = hasher(ss.str()); + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void XPUPatternMatcher::RemoveOverlappedMatch( + std::vector *subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto &subgraph : *subgraphs) { + bool valid = true; + for (auto &item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto &item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher.h b/lite/core/mir/xpu_pattern_matcher.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac03718f32a859ff6888e63e57fd4098e435e06 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/mir/pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +/* + * PatternMatcher helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. 
+ * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PMNodes in a PMPattern, + * 2. Extend a PMNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a matcher + * PatternMatcher matcher; + * // Define the matcher's pattern, by adding PMNode and define the edges. + * auto* node0 = matcher.mutable_pattern().AddNode(...) + * auto* node1 = matcher.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * matcher.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * PatternMatcher::handle_t handler = some labmda + * // Execute the matcher. + * matcher(&graph, handler); + */ +struct XPUPatternMatcher { + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(SSAGraph* graph, handle_t handler); + + const PMPattern& pattern() const { return pattern_; } + PMPattern* mutable_pattern() { return &pattern_; } + + // Mark the nodes that fits the pattern. + bool MarkPMNodesInGraph(SSAGraph* graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PMNodes will be removed, so can't shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + + using hit_rcd_t = + std::pair; + PMPattern pattern_; + std::unordered_map> pmnodes2nodes_; + std::vector> extra_input_vars_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.cc b/lite/core/mir/xpu_pattern_matcher_high_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ffc496d1593d15f02d82e824c06443e7b3e01c9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) { + VLOG(4) << "\n" << matcher_.pattern().DotString(); + // Get subgraphs and record the mir::Node pointers for each PMNode. + auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) { + // get all the reigistered nodes. 
+ key2nodes_.emplace_back(); + for (auto &item : nodes_) { + key2nodes_.back()[item.first] = subgraph.at(item.second); + } + }; + + matcher_(graph, handler); +} + +void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) { + std::set keys; + for (auto &node : nodes_) { + if (node.second->IsIntermediate()) { + keys.insert(node.first); + } + } + + VLOG(4) << "keys: " << key2nodes_.size(); + std::unordered_set nodes2rm; + for (auto &matched : key2nodes_) { + for (const auto &key : keys) { + nodes2rm.insert(matched.at(key)); + } + } + + VLOG(3) << "clean nodes " << nodes2rm.size(); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) { + auto it = nodes_.find(key); + if (it != nodes_.end()) { + return it->second; + } + nodes_.emplace(key, + matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key))); + it = nodes_.find(key); + return it->second; +} + +PMNode *XPUFuseBase::OpNode(const std::string &key, + const std::string &op_type) { + GetOrCreateNode(key)->set_op_type(op_type); + GetOrCreateNode(key)->AsOp(op_type); + return GetOrCreateNode(key); +} + +PMNode *XPUFuseBase::VarNode(const std::string &key) { + GetOrCreateNode(key)->AsVar(); + return GetOrCreateNode(key); +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.h b/lite/core/mir/xpu_pattern_matcher_high_api.h new file mode 100644 index 0000000000000000000000000000000000000000..3302bcb6137f16afcf82269af91c8a13558da2b9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/core/mir/xpu_pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +class XPUFuseBase { + public: + using key2nodes_t = std::map; + + virtual ~XPUFuseBase() = default; + + void operator()(SSAGraph* graph) { + BuildPattern(); + PerformPatternMatcher(graph); + + for (size_t i = 0; i < key2nodes_.size(); ++i) { + InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]); + } + + DeleteInterNodes(graph); + } + + // Build a PMPattern using PMNode. + virtual void BuildPattern() = 0; + + // Generate an operator desc with a matched subgraph. 
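A fuser built on this helper only supplies the pattern and the rewrite; the extra_input_vars argument of InsertNewNode carries the argument nodes (typically weights) that feed the matched subgraph without being declared in the pattern. A minimal, hypothetical skeleton, with the element type of extra_input_vars assumed to be Node*:

class MyToyXPUFuser : public xpu::XPUFuseBase {
 public:
  void BuildPattern() override {
    auto* x = VarNode("x")->assert_is_op_input("relu", "X")->AsInput();
    auto* op = OpNode("relu", "relu");  // anchor, re-pointed in InsertNewNode
    auto* out =
        VarNode("out")->assert_is_op_output("relu", "Out")->AsOutput();
    *x >> *op >> *out;
  }

 protected:
  void InsertNewNode(SSAGraph* graph,
                     const key2nodes_t& matched,
                     const std::vector<Node*>& extra_input_vars) override {
    // Build a cpp::OpDesc, create/attach the fused op and re-point the
    // anchor stmt, as XPUResNet50Fuser::InsertNewNode does above.
  }
};

// Typical driver inside a ProgramPass::Apply:
//   MyToyXPUFuser fuser;
//   fuser(graph.get());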
+ virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } + + PMNode* OpNode(const std::string& key, const std::string& op_type); + + PMNode* VarNode(const std::string& key); + + protected: + virtual void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) = 0; + + void PerformPatternMatcher(SSAGraph* graph); + + // Delete nodes that are marked as Intermediate + void DeleteInterNodes(SSAGraph* graph); + + PMNode* GetOrCreateNode(const std::string& key); + + protected: + XPUPatternMatcher matcher_; + std::map nodes_; + std::vector key2nodes_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index a9ccd1b9ae9a5d45f8d0e5638b3aab1d73d1903c..f8a706179374a0c86e28cf9a3638f5df2c932540 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -157,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, return var->GetMutable(); } +void OpLite::AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var) { + bool is_have_input = + op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0; + CHECK(is_dispensable || is_have_input); + if (is_have_input) { + std::string input_var_name = op_desc.Input(input_name).front(); + *input_var = scope->FindVar(input_var_name)->GetMutable(); + } +} + +void OpLite::AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var) { + bool is_have_output = + op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0; + CHECK(is_dispensable || is_have_output); + if (is_have_output) { + std::string output_var_name = op_desc.Output(output_name).front(); + *output_var = scope->FindVar(output_var_name)->GetMutable(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 1cdc33825cb4ffb758b46ac4b9bee968b3fca055..428b188c468ded790e74c9cc4f5da5c7efe2fd00 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -105,6 +105,20 @@ class OpLite : public Registry { return kernel_.get(); } + // Attach input variable from scope by op_desc and input name + void AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var); + + // Attach output variable from scope by op_desc and output name + void AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var); + virtual ~OpLite() = default; protected: diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 5fc2e15d188118c9c756fc84c732390ad4036df5..2b1dfd5704edf99d6decad6dd3f07d622727b21a 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -155,6 +155,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNCHW); INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt64, kNCHW); INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kFloat, kNHWC); INIT_FOR(kHost, kFloat, kAny); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index afde70c58ad57c41cd48060abbf3b72360ef943a..65e7dd46b1994872410b4f0e8e32004d8f97380f 100644 --- a/lite/core/op_registry.h +++ 
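The new AttachInput/AttachOutput helpers centralize the optional-argument boilerplate in op AttachImpl bodies: they CHECK unless the argument is declared dispensable, and only look up the scope variable when the argument is actually present. A hypothetical use inside an op; the op and its param fields are illustrative:

// Assumes param_.x / param_.bias / param_.output are lite::Tensor* fields.
bool MyOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  AttachInput(op_desc, scope, "X", /*is_dispensable=*/false, &param_.x);
  AttachInput(op_desc, scope, "Bias", /*is_dispensable=*/true, &param_.bias);
  AttachOutput(op_desc, scope, "Out", /*is_dispensable=*/false,
               &param_.output);
  return true;
}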
b/lite/core/op_registry.h @@ -135,6 +135,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index ca22c86907d4f582ef9d7ca84b908711ba1b8660..80c2bd553f6b8073e55d28ef0115246266a6a1c9 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -75,6 +75,8 @@ class Optimizer { (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif + "__xpu__resnet_fuse_pass", + "__xpu__multi_encoder_fuse_pass", "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and @@ -115,9 +117,15 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", + "mlu_subgraph_pass", + "runtime_context_assign_pass", "argument_type_display_pass", + + "mlu_postprocess_pass", + "memory_optimize_pass"}}; + if (passes.size() == 1) { passes_local.push_back(passes[0]); } diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68..3906cf0989a11c079323bdc8f256e6b5a5a33394 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -100,7 +100,8 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << "Avg (ms)" << " " << setw(12) << left << "Min (ms)" << " " << setw(12) << left << "Max (ms)" - << " " << setw(12) << left << "Last (ms)" << std::endl; + << " " << setw(12) << left << "Last (ms)" + << " " << setw(12) << left << "Percent (%)" << std::endl; // Profile information. if (concise) { std::map summary(op_comp); @@ -117,7 +118,16 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { summary.insert({unit.Character(), info}); } } + // compute total time + float total = 0.0; for (const auto& item : summary) { + total += item.second.avg; + } + for (const auto& item : summary) { + float percent = 0; + if (total > 0) { + percent = 100 * (item.second.avg / total); + } // clang-format off ss << setw(25) << left << fixed << item.first.op_type \ << " " << setw(40) << left << fixed << item.first.kernel_name \ @@ -125,12 +135,23 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << item.second.avg \ << " " << setw(12) << left << fixed << item.second.min \ << " " << setw(12) << left << fixed << item.second.max \ + << " " << setw(12) << left << fixed << percent << "%" \ << " " << std::endl; // clang-format on } } else { + float total = 0.0; for (auto& unit : units_) { const auto& times = unit.Timer(type)->LapTimes(); + total += times.Avg(w); + } + for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); + float run = times.Avg(w); + float percent = 0; + if (total > 0) { + percent = 100 * (run / total); + } // clang-format off ss << setw(25) << left << fixed << unit.Character().op_type \ << " " << setw(40) << left << fixed << unit.Character().kernel_name \ @@ -139,6 +160,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << times.Min(w) \ << " " << setw(12) << left << fixed << times.Max(w) \ << " " << setw(12) << left << fixed << times.Last(w) \ + << " " << setw(12) << left << fixed << percent << "%" \ << std::endl; // clang-format on } diff --git a/lite/core/workspace.h b/lite/core/workspace.h index 117b80aaa7863719536d8dbec70cf38c7ba04efc..54efb6699ac6df63286b26843f8d79b7c84949f1 100644 --- 
a/lite/core/workspace.h
+++ b/lite/core/workspace.h
@@ -69,6 +69,13 @@ class WorkSpace {
   }
 #endif
 
+#if defined(LITE_WITH_MLU)
+  static WorkSpace& Global_MLU() {
+    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
+    return *x;
+  }
+#endif
+
  private:
   explicit WorkSpace(TargetType x) : target_(x) {}
 
diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h
index 36386f7eb967f31ec258681fe17222a928aa7b4b..b1f2f14a0a4534e588d18237826858812740db69 100644
--- a/lite/fluid/lod.h
+++ b/lite/fluid/lod.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace lite {
 namespace fluid {
 
-using LoD = std::vector<std::vector<size_t>>;
+using LoD = std::vector<std::vector<uint64_t>>;
 
 static LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt
index ad0269148e08b5c6c1e7fed144ccace6efc697f1..9ab165569a973cbbf5add82ee1c3263016376851 100644
--- a/lite/kernels/CMakeLists.txt
+++ b/lite/kernels/CMakeLists.txt
@@ -10,5 +10,6 @@ add_subdirectory(opencl)
 add_subdirectory(fpga)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
 add_subdirectory(hw_ascend_npu)
diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc
index d609716ee53ec584b8340e9b72498ed95afd5820..ea60cf528ea71f0bc0ba0a162063bd76899622f9 100644
--- a/lite/kernels/arm/activation_compute.cc
+++ b/lite/kernels/arm/activation_compute.cc
@@ -179,6 +179,34 @@ void SquareCompute::Run() {
       x_data, output_data, x_dims.production(), ctx.threads());
 }
 
+void HardSwishCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto x_dims = param.X->dims();
+  auto x_data = param.X->data<float>();
+  auto output_data = param.Out->mutable_data<float>();
+  float threshold = param.hard_swish_threshold;
+  float scale = param.hard_swish_scale;
+  float offset = param.hard_swish_offset;
+  lite::arm::math::act_hard_swish<float>(x_data,
+                                         output_data,
+                                         x_dims.production(),
+                                         threshold,
+                                         scale,
+                                         offset,
+                                         ctx.threads());
+}
+
+void ReciprocalCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto x_dims = param.X->dims();
+  auto x_data = param.X->data<float>();
+  auto output_data = param.Out->mutable_data<float>();
+  lite::arm::math::act_reciprocal<float>(
+      x_data, output_data, x_dims.production(), ctx.threads());
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -275,3 +303,21 @@ REGISTER_LITE_KERNEL(
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+REGISTER_LITE_KERNEL(hard_swish,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::HardSwishCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
+REGISTER_LITE_KERNEL(reciprocal,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::ReciprocalCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h
index 476d7bb0a32db193d9afb1451507699d0af71736..2e8deda786a1ea9af70499c7b33c8aa1c6e19370 100644
--- a/lite/kernels/arm/activation_compute.h
+++ b/lite/kernels/arm/activation_compute.h
@@ -148,6 +148,24 @@ class SquareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
   virtual ~SquareCompute() = default;
 };
 
+class HardSwishCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  void Run() override;
+
+  virtual ~HardSwishCompute() = default;
+};
+
+class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  void Run() override;
+
+  virtual ~ReciprocalCompute() = default;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index a52428aa097099150139de82627d5770c9b9071c..94fe384d0414d87f38fb0d1ab3e8ac1033423702 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -5,3 +5,4 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps})
diff --git a/lite/kernels/host/ctc_align_compute.cc b/lite/kernels/host/ctc_align_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a62c2ee15ac2752d5d3349fbaaeb18f31ac4c5a0
--- /dev/null
+++ b/lite/kernels/host/ctc_align_compute.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/host/ctc_align_compute.h" +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +LoD ToAbs(const LoD& in) { + if (in.empty()) return in; + LoD result; + for (auto& src : in) { + std::vector dest(src.size() + 1, 0); + for (int i = 0; i < src.size(); i++) { + dest[i + 1] = dest[i] + src[i]; + } + result.emplace_back(dest); + } + return result; +} + +LoD ToNorm(const LoD& in) { + if (in.empty()) return in; + LoD result; + for (auto& src : in) { + std::vector dest(src.size() - 1, 0); + for (int i = 0; i < dest.size(); i++) { + dest[i] = src[i + 1] - src[i]; + } + result.emplace_back(dest); + } + return result; +} + +LoD ToAbsOffset(const LoD& in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +template +void CtcAlignCompute::Run() { + auto& param = this->template Param(); + auto* input = param.input; + auto* output = param.output; + size_t blank = static_cast(param.blank); + bool merge_repeated = param.merge_repeated; + size_t padding_value = static_cast(param.padding_value); + + const auto* input_data = input->template data(); + auto input_dims = input->dims(); + auto* output_data = output->template mutable_data(); + + if (input->lod().empty()) { + auto* input_length = param.input_length; + auto* output_length = param.output_length; + CHECK(input_length != nullptr); + CHECK(output_length != nullptr); + const auto* input_length_data = input_length->template data(); + auto* output_length_data = output_length->template mutable_data(); + + for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0]; batch_id++) { + T prev_token = -1; + size_t output_idx = 0; + for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) { + size_t input_ind = batch_id * input_dims[1] + i; + if ((unsigned)input_data[input_ind] != blank && + !(merge_repeated && input_data[input_ind] == prev_token)) { + output_data[batch_id * input_dims[1] + output_idx] = + input_data[input_ind]; + ++output_idx; + } + prev_token = input_data[input_ind]; + } + output_length_data[batch_id] = output_idx; + for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++) + output_data[batch_id * input_dims[1] + j] = padding_value; + } + } else { + const size_t level = 0; + + auto input_lod = input->lod(); + input_lod = ToAbs(input->lod()); + input_lod = ToAbsOffset(input_lod); + CHECK_EQ(input_dims[0], static_cast(input_lod[level].back())); + + const size_t num_sequences = input_lod[level].size() - 1; + // merge repeated tokens and delete blank + size_t output_idx = 0; + std::vector output_lod0(1, 0); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; + ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(static_cast(output_idx)); + } + + LoD output_lod; + output_lod.push_back(output_lod0); + output_lod = ToNorm(output_lod); + output->set_lod(output_lod); + output->Resize({static_cast(output_lod0.back()), 1}); + if (output_lod0.back() == 0) { + 
output->Resize({1, 1}); + output_data = output->template mutable_data(); + output_data[0] = -1; + } + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle +using ctc_align_int64 = + paddle::lite::kernels::host::CtcAlignCompute; +REGISTER_LITE_KERNEL(ctc_align, kHost, kInt64, kNCHW, ctc_align_int64, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindInput("InputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .BindOutput("OutputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); + +using ctc_align_int32 = + paddle::lite::kernels::host::CtcAlignCompute; +REGISTER_LITE_KERNEL(ctc_align, kHost, kInt32, kNCHW, ctc_align_int32, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("InputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("OutputLength", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .Finalize(); diff --git a/lite/kernels/host/ctc_align_compute.h b/lite/kernels/host/ctc_align_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..737fb3be6c96d91a3cde4a8f9053c6f7b9c7ec69 --- /dev/null +++ b/lite/kernels/host/ctc_align_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +class CtcAlignCompute : public KernelLite { + public: + void Run() override; + + virtual ~CtcAlignCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt index 1c41f05ca0cb23013418654f195394f88adf05b1..f9395d45ccecccaf3f873797d0c2d71eda266319 100644 --- a/lite/kernels/mlu/CMakeLists.txt +++ b/lite/kernels/mlu/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(bridges) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) +add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt index 302d580ee1594f983e516d42da6f57221b3b33c8..82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514 100644 --- a/lite/kernels/mlu/bridges/CMakeLists.txt +++ b/lite/kernels/mlu/bridges/CMakeLists.txt @@ -15,6 +15,9 @@ lite_cc_library(subgraph_bridge_elementwise_ops_mlu SRCS elementwise_ops.cc DEPS lite_cc_library(subgraph_bridge_pool_op_mlu SRCS pool_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_softmax_op_mlu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -26,16 +29,20 @@ set(mlu_subgraph_bridges subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu subgraph_bridge_batch_norm_op_mlu + subgraph_bridge_scale_op_mlu + subgraph_bridge_interp_op_mlu + subgraph_bridge_concat_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") - -# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) -# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# 
lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) -# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) - +lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) +lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 50291ec297f9d035f8a7fbe1b525f8ece27bfeb6..286195d9d5f961288dd0156db31ff8aacae58227 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -31,20 +31,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { VLOG(3) << "[MLU] Converting " + op_type + "..."; // Create act node and set params from op + auto fp_type = graph->FPType(); auto x_var_name = op_info->Input("X").front(); auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); auto output_tensor = graph->AddNode( - out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); CHECK(graph->HasNode(x_var_name)); auto input_tensor = graph->GetNode(x_var_name); - cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type); cnmlBaseOp_t activation_op; - 
CNML_CALL(cnmlCreateActiveOp(&activation_op,
-                               act_type,
-                               input_tensor->mlu_tensor(),
-                               output_tensor->mlu_tensor()));
+  if (op_type == "leaky_relu") {
+    auto alpha = op_info->GetAttr<float>("alpha");
+    std::vector<int64_t> shape = {1, 1, 1, 1};
+    std::string alpha_var_name = string_format("leaky_relu_alpha_%p", op);
+    auto alpha_tensor =
+        graph->AddNode(alpha_var_name, shape, CNML_CONST, CNML_NHWC, fp_type);
+    graph->BindConstRawData(alpha_var_name, &alpha, 1, true);
+    CNML_CALL(cnmlCreatePreluOp(&activation_op,
+                                input_tensor->mlu_tensor(),
+                                output_tensor->mlu_tensor(),
+                                alpha_tensor->mlu_tensor()));
+  } else {
+    cnmlActiveFunction_t act_type = OpTypeToCNMLActType(op_type);
+    CNML_CALL(cnmlCreateActiveOp(&activation_op,
+                                 act_type,
+                                 input_tensor->mlu_tensor(),
+                                 output_tensor->mlu_tensor()));
+  }
   graph->FuseOp(activation_op);
   return SUCCESS;
 }
@@ -54,4 +68,11 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 }  // namespace lite
 }  // namespace paddle
 
+REGISTER_SUBGRAPH_BRIDGE(sigmoid,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc
index 51cdc52dc6da764ab0c2d720b9159fd8b0a2c0df..7cec0529e49e694c362b3e0a550948f7855c85a2 100644
--- a/lite/kernels/mlu/bridges/act_op_test.cc
+++ b/lite/kernels/mlu/bridges/act_op_test.cc
@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {
 
-int ActConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void FillTensor(Tensor* x, float lower = -2, float upper = -2);
 
@@ -136,7 +134,7 @@ void test_act(std::vector<int64_t> x_shape, std::string op_type) {
 
 TEST(MLUBridges, activation) {
   std::vector<std::vector<int64_t>> shapes{{1}, {2, 3}, {1, 2, 3, 4}};
-  std::vector<std::string> types{"sigmoid", "relu", "tanh"};
+  std::vector<std::string> types{"sigmoid", "relu", "tanh", "leaky_relu"};
   for (auto x_shape : shapes) {
     for (auto op_type : types) {
       test_act(x_shape, op_type);
@@ -149,8 +147,7 @@ TEST(MLUBridges, activation) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         sigmoid,
-                         paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
+USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
+USE_SUBGRAPH_BRIDGE(relu, kMLU)
+USE_SUBGRAPH_BRIDGE(tanh, kMLU)
+USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU)
diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc
index d95a5115c96c10a8881f50c44fee9881c6a9e218..7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55 100644
--- a/lite/kernels/mlu/bridges/batch_norm_op.cc
+++ b/lite/kernels/mlu/bridges/batch_norm_op.cc
@@ -42,7 +42,7 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto output = scope->FindVar(y_var_name)->GetMutable<Tensor>();
   auto output_dims = output->dims().Vectorize();
   auto output_tensor = graph->AddNode(
-      y_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType());
+      y_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
 
   CHECK(graph->HasNode(x_var_name));
 
diff --git a/lite/kernels/mlu/bridges/batch_norm_op_test.cc b/lite/kernels/mlu/bridges/batch_norm_op_test.cc
index 47e291bf3d83e8ce85216e86505817be6ed8b106..65b24a0a72a48a306b6a8976efd8839679d58038 100644
---
a/lite/kernels/mlu/bridges/batch_norm_op_test.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op_test.cc @@ -23,8 +23,6 @@ namespace lite { namespace subgraph { namespace mlu { -int BatchNormConverter(void* ctx, OpLite* op); - template void batch_norm_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -139,9 +137,7 @@ void test_batch_norm( {bs, ic, ih, iw}, {0, 2, 3, 1}); - out->Resize({bs, ih, iw, ic}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -181,6 +177,4 @@ TEST(MLUBridges, batch_norm) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - batch_norm, - paddle::lite::subgraph::mlu::BatchNormConverter); +USE_SUBGRAPH_BRIDGE(batch_norm, kMLU) diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..14f0da746a00c1ea10ffae824217dbb2df84df55 --- /dev/null +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X"); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto param_axis = op_info->GetAttr("axis"); + + std::vector input_tensor; + for (auto x_name : x_var_name) { + CHECK(graph->HasNode(x_name)); + input_tensor.push_back(graph->GetNode(x_name)->mlu_tensor()); + } + + auto dims = output_dims.size(); + int axis = (param_axis < 0) ? 
(param_axis + dims) : param_axis; + CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; + int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; + int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t concat_op; + cnmlTensor_t outputs = output_tensor->mlu_tensor(); + CNML_CALL(cnmlCreateNdConcatOp(&concat_op, + nhwc_axis, + input_tensor.data(), + x_var_name.size(), + &outputs, + 1)); + graph->FuseOp(concat_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(concat, + kMLU, + paddle::lite::subgraph::mlu::ConcatConverter); diff --git a/lite/kernels/mlu/bridges/concat_op_test.cc b/lite/kernels/mlu/bridges/concat_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4b48a9ef45430ec5867d231bbc2d0a798ec66d0 --- /dev/null +++ b/lite/kernels/mlu/bridges/concat_op_test.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/concat_op.h" +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void concat_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = op_info->Input("X"); + std::vector inputs; + for (auto var : x) { + inputs.push_back(scope->FindVar(var)->GetMutable()); + } + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = inputs[j]; + } + size_t num = inputs.size(); + int rows = 1; + auto dim_0 = inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + std::vector inputs_cols(inputs.size()); + for (int i = 0; i < num; ++i) { + int t_cols = inputs[i]->numel() / rows; + out_cols += t_cols; + inputs_cols[i] = t_cols; + } + for (int k = 0; k < out_rows; ++k) { + float* dst_ptr = out->mutable_data() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = inputs_cols[j]; + const float* src_prt = inputs[j]->data() + k * col_len; + std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); + col_idx += col_len; + } + } +} + +void test_concat(std::vector> input, int axis) { + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + + // prepare input&output variables + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + x->Resize(DDim(input[0])); + y->Resize(DDim(input[1])); + auto* 
out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + CHECK_EQ(out->dims(), out_ref->dims()); + + // initialize input&output data + FillTensor(x); + FillTensor(y); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("concat"); + opdesc.SetInput("X", {x_var_name, y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + auto op = CreateOp(opdesc, &scope); + concat_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x, input_y; + input_x.Resize(DDim(input[0])); + input_y.Resize(DDim(input[1])); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input[0][0]), + static_cast(input[0][1]), + static_cast(input[0][2]), + static_cast(input[0][3])}, + {0, 2, 3, 1}); + transpose(y->mutable_data(), + input_y.mutable_data(), + {static_cast(input[1][0]), + static_cast(input[1][1]), + static_cast(input[1][2]), + static_cast(input[1][3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + y->CopyDataFrom(input_y); + + LaunchOp(op, {x_var_name, y_var_name}, {out_var_name}); + + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out->dims()); + auto os = out->dims(); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, concat) { + test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0); + test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1); + test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2); + test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(concat, kMLU); diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e9fdacdca92398cee9f5e01b3f34e41e672274b5..6a7ef408eb7432950d5a0985dd6e174236e937e0 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -31,15 +31,16 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... 
"; - // Get input, filter and op attributes + // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); - const auto& input_dims_nhwc = + const auto& input_dims = scope->FindVar(input_var_name)->GetMutable()->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); const auto filter_var_name = op_info->Input("Filter").front(); auto* filter = scope->FindVar(filter_var_name)->GetMutable(); const auto& filter_dims = filter->dims(); const auto output_var_name = op_info->Output("Output").front(); + auto* output = scope->FindVar(output_var_name)->GetMutable(); + const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; CHECK_EQ(input_dims.size(), 4); @@ -70,24 +71,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - std::vector output_shape({bs, oc}); - for (size_t i = 0; i < 2; i++) { - const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / - strides[i] + - 1); - } - - const auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - const auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + const auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, @@ -119,14 +104,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(FATAL) << "UnSupported weight precision!"; } - cnmlConvOpParam_t conv_param; - CNML_CALL(cnmlCreateConvOpParam(&conv_param, - strides[0], - strides[1], - dilations[0], - dilations[1], - paddings[0] * 2, - paddings[2] * 2)); std::string bias_var_name; std::shared_ptr bias_tensor; if (HasInputArg(op_info, scope, "Bias")) { @@ -160,15 +137,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->FPType()); graph->BindConstData(bias_var_name, bias); } - cnmlBaseOp_t conv_op; + const auto input_scale = op_info->GetAttr("input_scale"); - CNML_CALL(cnmlCreateConvOpForward( - &conv_op, - conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), - output_tensor->mlu_tensor(), - filter_tensor->mlu_tensor(), - bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr)); + + bool use_first_conv = false; + if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + use_first_conv = true; + } + + cnmlBaseOp_t conv_op; + if (use_first_conv) { + cnmlConvFirstOpParam_t conv_param; + CNML_CALL(cnmlCreateConvFirstOpParam_V2(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[2], + paddings[2], + paddings[0], + paddings[0])); + const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + const auto std_tensor = graph->AddNode("first_conv_std_tensor", + std::vector{3}, + CNML_CONST, + CNML_CNHW, + graph->FPType()); + + graph->BindConstRawData("first_conv_mean_tensor", + lite::DeviceInfo::Global().MeanVec().data(), + 3, + false); + graph->BindConstRawData("first_conv_std_tensor", + lite::DeviceInfo::Global().StdVec().data(), + 3, + false); + + graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + CNML_CALL(cnmlCreateConvFirstOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + mean_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + std_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvOpForward( + &conv_op, + conv_param, + graph->GetNode(input_var_name)->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); + } graph->SetComputingDataType( conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); @@ -183,7 +220,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); - CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index e8ef9ba04fd6126f00f4ee2ff869495929bfdc9a..e34dd7c2a85dbda62596b6e82d820fc437bfd194 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -25,8 +25,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ConvConverter(void* ctx, OpLite* op); - void conv_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -246,10 +244,6 @@ void test_conv(int bs, } } - input->Resize({bs, ih, iw, ic}); - output->Resize( - {output_shape[0], output_shape[2], output_shape[3], output_shape[1]}); - // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc_mlu, &scope); LaunchOp(op, {input_var_name}, {output_var_name}); @@ -342,9 +336,5 @@ TEST(MLUBridges, conv) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - conv2d, - paddle::lite::subgraph::mlu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - depthwise_conv2d, - paddle::lite::subgraph::mlu::ConvConverter); +USE_SUBGRAPH_BRIDGE(conv2d, kMLU) +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU) diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 4ef949925d20e0a2cb1c7f25d840e2041d79dd7a..41526a0100ba71be9eda25983cb96aa888d6cf4d 100644 --- 
a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -77,7 +77,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto output_tensor = graph->AddNode(out_var_name, x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); cnmlBaseOp_t elementwise_op; @@ -90,7 +90,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mid_tensor = graph->AddNode(out_var_name + "_mid", x->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); CNML_CALL(cnmlCreateBroadcastAddOp(&elementwise_op, x_tensor->mlu_tensor(), diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index 388aa68600e180945d19e1a4e4728cf26bf801e1..e5087dd708eee3ba255fbfa0383d31b12a6b6870 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int ElementwiseConverter(void* ctx, OpLite* op); - template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_add, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_sub, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_mul, - paddle::lite::subgraph::mlu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(MLU, - elementwise_div, - paddle::lite::subgraph::mlu::ElementwiseConverter); +USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU) +USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU) diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 43a75daa2b3d2d6200f3607e213ab62ee6ba3cdb..286feec8d4d44eaa025f333d559c32ca72f042ff 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -37,6 +37,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); + auto output = scope->FindVar(output_var_name)->GetMutable(); auto x_dims = x->dims(); auto w_dims = w->dims(); @@ -50,15 +51,11 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto input_scale = op_info->GetAttr("input_scale"); - std::vector output_shape_nhwc({1, 1, 1, w_dims[1]}); auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, + output->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); std::string bias_var_name; std::shared_ptr bias_tensor; diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index 7e5cfdb32e7d993f32403dc764462575181f9d4d..8f92b6abad97650100d0862d49550abaf62daac9 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int FCConverter(void* ctx, OpLite* op); - void fc_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -141,15 +139,34 @@ void test_fc(const std::vector& 
input_shape, } auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); - input->Resize({static_cast(input_shape[0]), - static_cast(input_shape[2]), - static_cast(input_shape[3]), - static_cast(input_shape[1])}); - out->Resize({static_cast(input_shape[0]), static_cast(w_shape[1])}); + + Tensor input_tmp, out_tmp; + input_tmp.Resize(input_shape); + transpose(input->mutable_data(), + input_tmp.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + input->CopyDataFrom(input_tmp); + LaunchOp(fc_op_mlu, {input_var_name}, {out_var_name}); - // compare results + auto os = out->dims(); + out_tmp.Resize(os); auto* out_data = out->mutable_data(); + // transpose(out_data, + // out_tmp.mutable_data(), + // {static_cast(os[0]), + // static_cast(os[2]), + // static_cast(os[3]), + // static_cast(os[1])}, + // {0, 3, 1, 2}); + // + // out_data = out_tmp.mutable_data(); + + // compare results auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); @@ -170,4 +187,4 @@ TEST(MLUBridges, fc) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter); +USE_SUBGRAPH_BRIDGE(fc, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 27c6ab2597fa6930b14c4c4e34750030608167b6..65c2f8214c13ee8d004dbe4b2e706523d007469c 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -25,12 +25,12 @@ namespace mlu { std::shared_ptr Graph::AddNode(const std::string& name, std::vector shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, + cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, void* raw_ptr) { CHECK(!HasNode(name)); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, data_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 140900a2dde004281945e50fb1c72d09b58befa1..b846d15af06c683ad685b04da5588f7ecedd0d38 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -23,6 +23,12 @@ #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#define PRINT_HW_TIME false + +#if PRINT_HW_TIME +#include //NOLINT +#endif + namespace paddle { namespace lite { namespace subgraph { @@ -32,13 +38,30 @@ namespace mlu { // to the MLU IR graph class Graph { public: - Graph() { CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); } + Graph() { + CNML_CALL(cnmlCreateFusionOp(&fusion_op_)); +#if PRINT_HW_TIME + CNRT_CALL(cnrtCreateNotifier(¬ifier_start_)); + CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); +#endif + } ~Graph() { + FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); for (auto op : ops_) { CNML_CALL(cnmlDestroyBaseOp(&op)); } +#if PRINT_HW_TIME + CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); + CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); + double total_time = 0; + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; +#endif } // Data node @@ -89,6 +112,10 @@ class Graph { } void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { +#if PRINT_HW_TIME + thread_local float hw_time; + 
CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que));
+#endif
     CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_,
                                             input_addrs_.data(),
                                             input_addrs_.size(),
@@ -96,7 +123,61 @@ class Graph {
                                             output_addrs_.size(),
                                             &forward_param,
                                             que));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que));
+#endif
+    CNRT_CALL(cnrtSyncQueue(que));
+#if PRINT_HW_TIME
+    CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time));
+    hw_time /= 1000.0f;
+    DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl;
+    std::lock_guard<std::mutex> lk(time_mut_);
+    time_log_.push_back(hw_time);
+#endif
   }
 
+  template <typename T>
+  void* RegisterConstData(size_t len) {
+    void* addr = malloc(len * sizeof(T));
+    const_data_storage_.push_back(addr);
+    return addr;
+  }
+
+  void FreeConstData() {
+    for (auto& addr : const_data_storage_) {
+      free(addr);
+    }
+  }
+
+  void BindConstRawData(std::string tensor_name,
+                        const float* data,
+                        size_t len,
+                        bool alloc = true) {
+    void* alloc_data;
+    if (fp_type_ == CNML_DATA_FLOAT32) {
+      if (alloc) {
+        alloc_data = RegisterConstData<float>(len);
+        memcpy(alloc_data, data, len * sizeof(float));
+      } else {
+        alloc_data = const_cast<void*>(static_cast<const void*>(data));
+      }
+      CNML_CALL(cnmlBindConstData_V2(
+          nodes_[tensor_name]->mlu_tensor(), alloc_data, false));
+    } else if (fp_type_ == CNML_DATA_FLOAT16) {
+      void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len);
+      CNRT_CALL(
+          cnrtCastDataType(const_cast<void*>(static_cast<const void*>(data)),
+                           CNRT_FLOAT32,
+                           data_fp16,
+                           CNRT_FLOAT16,
+                           len,
+                           nullptr));
+      CNML_CALL(cnmlBindConstData_V2(
+          nodes_[tensor_name]->mlu_tensor(), data_fp16, false));
+    } else {
+      CHECK(0);
+    }
+  }
 
   void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) {
@@ -158,6 +239,12 @@
   std::vector<std::shared_ptr<MLUTensor>> output_tensors_;
   std::vector<cnmlBaseOp_t> ops_;
   cnmlFusionOp_t fusion_op_;
+  std::vector<void*> const_data_storage_;
+#if PRINT_HW_TIME
+  cnrtNotifier_t notifier_start_{}, notifier_end_{};
+  std::mutex time_mut_;
+  std::vector<float> time_log_;
+#endif
 };
 
 }  // namespace mlu
diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e201199824d8042abd6002ccbe5bb659a9ca2898
--- /dev/null
+++ b/lite/kernels/mlu/bridges/interpolate_op.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out = scope->FindVar(out_var_name)->GetMutable(); + auto x_dims = x->dims(); + CHECK_EQ(x_dims.size(), 4); + auto scale = op_info->GetAttr("scale"); + auto out_w = op_info->GetAttr("out_w"); + auto out_h = op_info->GetAttr("out_h"); + auto align_corners = op_info->GetAttr("align_corners"); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto in_h = x_dims[2]; + auto in_w = x_dims[3]; + + // Priority: SizeTensor > OutSize > Scale > scale > out_h/out_w + if (HasInputArg(op_info, scope, "SizeTensor")) { + LOG(ERROR) << "Not support SizeTensor input now"; + CHECK(0); + } else { + if (HasInputArg(op_info, scope, "Scale")) { + LOG(ERROR) << "Not support Scale input now"; + CHECK(0); + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? out_w : -1; + } + if (HasInputArg(op_info, scope, "OutSize")) { + LOG(ERROR) << "Not support OutSize input now"; + CHECK(0); + } + } + + auto output_tensor = graph->AddNode(out_var_name, + out->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + cnmlBaseOp_t interp_op; + cnmlNearestNeighborOpParam_t nn_param; + CNML_CALL(cnmlCreateNearestNeighborOpParam(&nn_param, out_w, out_h)); + CNML_CALL(cnmlSetNearestNeighborAlignCorner(&nn_param, align_corners)); + CNML_CALL(cnmlCreateNearestNeighborOp(&interp_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nn_param)); + CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); + graph->FuseOp(interp_op); + + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kMLU, + paddle::lite::subgraph::mlu::InterpolateConverter); diff --git a/lite/kernels/mlu/bridges/interpolate_op_test.cc b/lite/kernels/mlu/bridges/interpolate_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e99da64358e6590af0b8e57dc3ddec142c8d0f0 --- /dev/null +++ b/lite/kernels/mlu/bridges/interpolate_op_test.cc @@ -0,0 +1,406 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/interpolate_op.h" +#include +#include +#include "lite/core/device_info.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void ResizeNearestAlign(const lite::Tensor* x, + lite::Tensor* out, + bool with_align) { + auto x_dims = x->dims(); + int num = x_dims[0]; + int channels = x_dims[1]; + int hin = x_dims[2]; + int win = x_dims[3]; + int hout = out->dims()[2]; + int wout = out->dims()[3]; + dtype scale_w = (with_align) ? (static_cast(win - 1) / (wout - 1)) + : (static_cast(win) / (wout)); + dtype scale_h = (with_align) ? (static_cast(hin - 1) / (hout - 1)) + : (static_cast(hin) / (hout)); + const dtype* src = x->data(); + dtype* dst = out->mutable_data(); + int dst_stride_w = 1; + int dst_stride_h = wout; + int dst_stride_c = wout * hout; + int dst_stride_batch = wout * hout * channels; + int src_stride_w = 1; + int src_stride_h = win; + int src_stride_c = win * hin; + int src_stride_batch = win * hin * channels; + for (int n = 0; n < num; ++n) { + for (int c = 0; c < channels; ++c) { + int src_index = n * src_stride_batch + c * src_stride_c; + for (int h = 0; h < hout; ++h) { + for (int w = 0; w < wout; ++w) { + int fw = (with_align) ? static_cast(scale_w * w + 0.5) + : static_cast(scale_w * w); + fw = (fw < 0) ? 0 : fw; + int fh = (with_align) ? static_cast(scale_h * h + 0.5) + : static_cast(scale_h * h); + fh = (fh < 0) ? 0 : fh; + int w_start = static_cast(fw); + int h_start = static_cast(fh); + int dst_index = n * dst_stride_batch + c * dst_stride_c + + h * dst_stride_h + w * dst_stride_w; + dst[dst_index] = + src[src_index + w_start * src_stride_w + h_start * src_stride_h]; + } + } + } + } +} + +template +void BilinearInterpRef(const lite::Tensor* x, + lite::Tensor* out, + bool align_corners, + int align_mode) { + auto x_dims = x->dims(); + int batch_size = x_dims[0]; + int channel_size = x_dims[1]; + auto x_h = x_dims[2]; + auto x_w = x_dims[3]; + CHECK_EQ(x_dims.size(), 4); + + auto out_dims = out->dims(); + int out_h = out_dims[2]; + int out_w = out_dims[3]; + + // copy from x if no change + if (x_h == out_h && x_w == out_w) { + out->CopyDataFrom(*x); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(x_h - 1) / (out_h - 1) + : static_cast(x_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(x_w - 1) / (out_w - 1) + : static_cast(x_w) / out_w; + } + + // naive bilinear interpolation + auto x_data = x->data(); + auto out_data = out->mutable_data(); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); + for (int k = 0; k < out_h; k++) { + int yn = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + yn = (yn > 0) ? yn : 0; + int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float dn = align_flag ? 
idx_src_y - yn : ratio_h * k - yn; + float ds = 1.f - dn; + { + vy_n[k] = yn; + vy_s[k] = ys; + vd_n[k] = dn; + vd_s[k] = ds; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); + for (int l = 0; l < out_w; l++) { + int xw = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + xw = (xw > 0) ? xw : 0; + int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw; + float de = 1.f - dw; + { + vx_w[l] = xw; + vx_e[l] = xe; + vd_w[l] = dw; + vd_e[l] = de; + } + } + + std::vector x_strides(x_dims.size(), 1); + for (int idx = x_strides.size() - 2; idx >= 0; idx--) { + x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; + } + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < channel_size; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + DType x0 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x1 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]]; + DType x2 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + DType x3 = x_data[i * x_strides[0] + j * x_strides[1] + + vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]]; + *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] + + x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l]; + out_data++; + } + } + } + } +} + +class InterpComputeTester { + protected: + // common attributes for this op. + std::string x_var_name = "X"; + std::string outsize_var_name = "OutSize"; + std::string out_var_name = "Out"; + std::string out_ref_var_name = "out_ref"; + DDim dims_{{1, 2, 3, 4}}; + + Scope scope; + std::string interp_method_ = "nearest"; + float scale_ = -1.f; + int out_h_ = -1; + int out_w_ = -1; + bool align_corners_ = true; + int align_mode_ = 1; + bool use_outsize_ = false; + + public: + InterpComputeTester(const std::string& alias, + DDim dims, + std::string interp_method = "nearest", + float scale = -1.f, + int out_h = -1, + int out_w = -1, + bool align_corners = true, + int align_mode = 1, + bool use_outsize = false) + : dims_(dims), + interp_method_(interp_method), + scale_(scale), + out_h_(out_h), + out_w_(out_w), + align_corners_(align_corners), + align_mode_(align_mode), + use_outsize_(use_outsize) {} + + void Execute(float abs_error) { + cpp::OpDesc op_desc; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* outsize = scope.Var(outsize_var_name)->GetMutable(); + auto* outref = scope.Var(out_ref_var_name)->GetMutable(); + int out_h = out_h_; + int out_w = out_w_; + if (scale_ > 0) { + out_h = static_cast(dims_[2] * scale_); + out_w = static_cast(dims_[3] * scale_); + } + x->Resize(dims_); + /* printf("----output tensor dims: %ld, %d, %d, %ld\n", dims_[0], out_h, + * out_w, dims_[1]); */ + std::vector out_shape_nchw = {dims_[0], dims_[1], out_h, out_w}; + outref->Resize(out_shape_nchw); + outsize->Resize({2}); + + FillTensor(x, -1.f, 1.f); + + if (use_outsize_) { + outsize->mutable_data()[0] = out_h; + outsize->mutable_data()[1] = out_w; + outsize->set_persistable(true); + } + + if (interp_method_ == "nearest") { + op_desc.SetType("nearest_interp"); + } else if (interp_method_ == "bilinear") { + 
op_desc.SetType("bilinear_interp"); + } else { + LOG(FATAL) << "unsupport"; + } + op_desc.SetInput("X", {x_var_name}); + if (use_outsize_) { + op_desc.SetInput("OutSize", {outsize_var_name}); + } + op_desc.SetOutput("Out", {out_var_name}); + op_desc.SetAttr("scale", scale_); + op_desc.SetAttr("out_h", out_h_); + op_desc.SetAttr("out_w", out_w_); + op_desc.SetAttr("align_corners", align_corners_); + op_desc.SetAttr("align_mode", align_mode_); + op_desc.SetAttr("interp_method", interp_method_); + auto op = CreateOp(op_desc, &scope); + + if (interp_method_ == "nearest") { + ResizeNearestAlign(x, outref, align_corners_); + } else if (interp_method_ == "bilinear") { + BilinearInterpRef(x, outref, align_corners_, align_mode_); + } + + int in = dims_[0], ic = dims_[1], ih = dims_[2], iw = dims_[3]; + Tensor input_trans; + input_trans.Resize(dims_); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {in, ic, ih, iw}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + if (use_outsize_) { + LaunchOp(op, {x_var_name, outsize_var_name}, {out_var_name}); + } else { + LaunchOp(op, {x_var_name}, {out_var_name}); + } + + auto* out_ref_data = outref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out_shape_nchw); + transpose( + out->mutable_data(), + output_trans.mutable_data(), + {static_cast(dims_[0]), out_h, out_w, static_cast(dims_[1])}, + {0, 3, 1, 2}); + auto* out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); ++i) { + EXPECT_NEAR(out_data[i], out_ref_data[i], abs_error); + } + } +}; + +void TestInterpOuthw(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (int out_h : {6, 8, 12}) { + for (int out_w : {6, 9}) { + printf("testcase %s: out_w %d, out_h %d\n", + interp_method.c_str(), + out_w, + out_h); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1.f, out_h, out_w); + tester.Execute(abs_error); + } + } + } + } +} + +void TestInterpScale(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + for (float scale : {0.3f, 1.f, 1.7f}) { + printf("testcase %s: scale: %f\n", interp_method.c_str(), scale); + InterpComputeTester tester("def", DDim(x_dims), interp_method, scale); + tester.Execute(abs_error); + } + } + } +} + +void TestInterpOutsize(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + /* for (auto interp_method : std::vector{"nearest", + * "bilinear"}) { */ + for (auto interp_method : std::vector{"nearest"}) { + printf("testcase %s: outsize: %d %d\n", interp_method.c_str(), 4, 4); + InterpComputeTester tester( + "def", DDim(x_dims), interp_method, -1, 4, 4, true, 1, true); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignCorners(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + printf( + "testcase nearest: scale: 0.4, out_w -1 out_h -1, align_corners %d\n", + align_corners); + InterpComputeTester tester( + "def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners); + tester.Execute(abs_error); + } + } +} + +void TestInterpAlignMode(float abs_error = 2e-5) { + for (auto x_dims : std::vector>{{3, 4, 8, 9}}) { + for (bool align_corners : {true, false}) { + for (int align_mode : {0, 1}) { + 
printf( + "testcase bilinear: scale: 0.7, out_w -1 out_h -1, align_corners " + "%d, mode %d\n", + align_corners, + align_mode); + InterpComputeTester tester("def", + DDim(x_dims), + "bilinear", + 0.7, + -1, + -1, + align_corners, + align_mode); + tester.Execute(abs_error); + } + } + } +} + +TEST(MLUBridges, interpolate) { + float abs_error = 2e-5; + TestInterpOuthw(abs_error); + TestInterpScale(abs_error); + // bug, not usable + // TestInterpOutsize(abs_error); + TestInterpAlignCorners(abs_error); + // only for bilinear interp + // TestInterpAlignMode(abs_error); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index 1b12970afadd4e3bdcd7568c05bc15583ccbaaae..d31ba0dd41111860a3b26d8ac3afb3273bef4557 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(pool2d, kMLU); USE_SUBGRAPH_BRIDGE(softmax, kMLU); USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(concat, kMLU); +USE_SUBGRAPH_BRIDGE(scale, kMLU); diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index 3119b6c77dca10641c7c7c32072969fedb1ecef6..f77c8084c76fc52c39938e723f02bde9b3cac41b 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -47,9 +47,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input, and attributes auto x_var_name = op_info->Input("X").front(); auto x = scope->FindTensor(x_var_name); - auto input_dims_nhwc = x->dims(); - const auto input_dims = DimNHWC2NCHW(input_dims_nhwc); auto output_var_name = op_info->Output("Out").front(); + auto output_shape = scope->FindTensor(output_var_name)->dims().Vectorize(); auto pooling_type = op_info->GetAttr<std::string>("pooling_type"); auto ceil_mode = op_info->GetAttr<bool>("ceil_mode"); auto paddings = op_info->GetAttr<std::vector<int>>("paddings"); @@ -81,23 +80,17 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - std::vector<int64_t> output_shape({input_dims[0], input_dims[1]}); - for (size_t i = 0; i < 2; i++) { - output_shape.push_back( - (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - ksize[0]) / - strides[i] + - 1); - } + // std::vector<int64_t> output_shape({input_dims[0], input_dims[1]}); + // for (size_t i = 0; i < 2; i++) { + // output_shape.push_back( + // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - + // ksize[0]) / + // strides[i] + + // 1); + // } - auto output_shape_nhwc = DimNCHW2NHWC(output_shape); - auto output_tensor = graph->AddNode(output_var_name, - output_shape_nhwc, - CNML_TENSOR, - CNML_NHWC, - graph->FPType()); - scope->FindVar(output_var_name) - ->GetMutable<::paddle::lite::Tensor>() - ->Resize(output_shape_nhwc); + auto output_tensor = graph->AddNode( + output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 29ef68781f4a99ebcc20901dabab6ee22a258424..8cee8dbe86109b14cff49f329d71074a9b3bfb61 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -24,8 +24,6 @@ namespace lite { namespace subgraph { namespace mlu { -int
PoolConverter(void* ctx, OpLite* op); - void pool_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); @@ -182,12 +180,7 @@ void test_pool(int bs, {0, 2, 3, 1}); auto os = out->dims(); - out->Resize({static_cast(os[0]), - static_cast(os[2]), - static_cast(os[3]), - static_cast(os[1])}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -275,6 +268,4 @@ TEST(MLUBridges, pool) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - pool2d, - paddle::lite::subgraph::mlu::PoolConverter); +USE_SUBGRAPH_BRIDGE(pool2d, kMLU) diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5557602bd7576ccd71c51f52a538a45fe27f7ada --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + auto bias_after_scale = op_info->GetAttr("bias_after_scale"); + auto scale = op_info->GetAttr("scale"); + auto bias = op_info->GetAttr("bias"); + auto beta = bias_after_scale ? 
bias : bias * scale; + + std::vector shape = {1, 1, 1, 1}; + + std::string prefix = string_format("_%p", op); + auto alpha_tensor = graph->AddNode( + "Alpha" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + "Beta" + prefix, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData("Alpha" + prefix, &scale, 1); + graph->BindConstRawData("Beta" + prefix, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(scale, + kMLU, + paddle::lite::subgraph::mlu::ScaleConverter); diff --git a/lite/kernels/mlu/bridges/scale_op_test.cc b/lite/kernels/mlu/bridges/scale_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0ed975a84a174d1a58c9ed23bb925fdcc82b46f --- /dev/null +++ b/lite/kernels/mlu/bridges/scale_op_test.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/scale_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void scale_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + float scale = op_info->GetAttr("scale"); + float bias = op_info->GetAttr("bias"); + bool bias_after_scale = op_info->GetAttr("bias_after_scale"); + if (!bias_after_scale) { + bias *= scale; + } + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * scale + bias; + } +} + +void test_scale(int bs, + int ic, + int ih, + int iw, + bool bias_after_scale, + float scale, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("scale"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("bias_after_scale", bias_after_scale); + opdesc.SetAttr("scale", scale); + opdesc.SetAttr("bias", bias); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + scale_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, scale) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto bias_after_scale : {false, true}) { + for (auto scale : {-1.0f, 5.0f}) { + for (auto bias : {-2.0f, 30.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + // << " bias_after_scale: " << bias_after_scale + << " scale: " << scale << " bias: " << bias; + test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias); + } + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + 
+USE_SUBGRAPH_BRIDGE(scale, kMLU); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc index b9e2b1116dc95ec276f8d85a5669cec45d98ea39..17c911675718a15c7ede4888b268ffcd62b4d8ed 100644 --- a/lite/kernels/mlu/bridges/softmax_op.cc +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -45,11 +45,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis = output_dims.size() + axis; } } - int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; auto output_tensor = graph->AddNode( - out_var_name, output_dims, CNML_TENSOR, CNML_NHWC, graph->FPType()); + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlBaseOp_t softmax_op; CNML_CALL(cnmlCreateNdSoftmaxOp(&softmax_op, nhwc_axis, diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc index 7ceb050d8008f8186fdd737c394d8fe8dc0ffd7f..a5251ed43c9187fc2874f9b01853b45b8abf7f1c 100644 --- a/lite/kernels/mlu/bridges/softmax_op_test.cc +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -23,8 +23,6 @@ namespace lite { namespace subgraph { namespace mlu { -int SoftmaxConverter(void* ctx, OpLite* op); - template void softmax_ref(const std::shared_ptr op) { Scope* scope = op->scope(); @@ -112,9 +110,7 @@ void test_softmax(const std::vector& input_shape, int axis) { {bs, ic, ih, iw}, {0, 2, 3, 1}); - out->Resize({bs, ih, iw, ic}); x->CopyDataFrom(input_trans); - x->Resize({bs, ih, iw, ic}); LaunchOp(op, {x_var_name}, {out_var_name}); @@ -171,6 +167,4 @@ TEST(MLUBridges, softmax) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(MLU, - softmax, - paddle::lite::subgraph::mlu::SoftmaxConverter); +USE_SUBGRAPH_BRIDGE(softmax, kMLU) diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h index 7bb2e1b20334e359b2db0ecf1fe61e16175413dc..12dc97a772dabc529bf183f783a22a9f2dfa936d 100644 --- a/lite/kernels/mlu/bridges/tensor.h +++ b/lite/kernels/mlu/bridges/tensor.h @@ -47,6 +47,8 @@ class MLUTensor { return mlu_ptr_; } + void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; } + ~MLUTensor(); private: diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc index cf2d7bd6c1ec5634bb0d7556a16166ac0b0bcb45..377a00689ef3a27f78ae008072578ab3701cd337 100644 --- a/lite/kernels/mlu/bridges/test_helper.cc +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, const std::vector& output_var_names) { CNRT_CALL(cnrtInit(0)); - SetMluDevice(0); + ::paddle::lite::SetMluDevice(0); cnrtQueue_t queue_; cnrtInvokeFuncParam_t forward_param; u32_t affinity = 1; @@ -47,7 +47,7 @@ void LaunchOp(const std::shared_ptr op, const auto& bridges = subgraph::Registry::Instance(); CHECK(bridges.Exists(op_type, TARGET(kMLU))); - // Convert all of input data vars and added into the MLU IR graph + // Convert input data var and add it into the MLU IR graph for (auto& input_name : input_var_names) { auto input_tensor = scope->FindMutableTensor(input_name); CHECK(input_tensor); @@ -58,7 +58,7 @@ void LaunchOp(const std::shared_ptr op, graph.AddNode(input_name, input_tensor->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph.FPType(), reinterpret_cast( input_tensor->mutable_data(TARGET(kMLU)))); @@ -68,6 +68,8 @@ void LaunchOp(const std::shared_ptr op, sizeof(float) * input_tensor->dims().production(), CNRT_MEM_TRANS_DIR_HOST2DEV)); } + op->CheckShape(); + op->InferShape(); bridges.Select(op_type, 
TARGET(kMLU))( reinterpret_cast<void*>(&graph), const_cast<OpLite*>(op.get()), nullptr); diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h index 2af8274e07713300277f7280f12e6d1fcb47c3c2..fa8fb1597c0fb068a855928dd20057d48ecd5eaf 100644 --- a/lite/kernels/mlu/bridges/utility.h +++ b/lite/kernels/mlu/bridges/utility.h @@ -84,7 +84,7 @@ struct FPTypeTraits { template <> struct FPTypeTraits { - typedef ::paddle::lite::fluid::float16 T; + typedef paddle::lite::fluid::float16 T; }; } // namespace mlu diff --git a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index bc6e1838d70383edb3dcc65d7a9b0f627719e963..02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL( .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); - -// kMLU, -// kFloat, -// kNHWC, -// paddle::lite::kernels::mlu::IoCopyHostToMluCompute, -// host_to_device) -// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) -// .Finalize(); -// -// -// kMLU, -// kFloat, -// kNHWC, -// paddle::lite::kernels::mlu::IoCopyMluToHostCompute, -// device_to_host) -// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) -// .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d4e16734d6d2dae6f5c119194008bce114a2e918 --- /dev/null +++ b/lite/kernels/mlu/layout_compute.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "lite/kernels/mlu/layout_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu {} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, + def_layout_nhwc2nchw_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, + def_layout_nhwc2nchw_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFloat, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kFP16, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + layout, + kMLU, + kInt8, + kNHWC, + paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, + def_layout_nchw2nhwc_fp32_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..edacdf8a98a2ffde6e538f61d4dd8259e3211b22 --- /dev/null +++ b/lite/kernels/mlu/layout_compute.h @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/backends/x86/math/math_function.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layout_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace mlu { + +template +struct FPTypeTraits {}; + +template <> +struct FPTypeTraits { + typedef float T; +}; + +template <> +struct FPTypeTraits { + typedef paddle::lite::fluid::float16 T; +}; + +template <> +struct FPTypeTraits { + typedef int8_t T; +}; + +template +inline void LayoutTransCompute(const int dim, + const lite::Context& context, + const lite::Tensor& in, + lite::Tensor* out, + const std::vector& axis) { + switch (dim) { + case 2: + paddle::lite::x86::math::Transpose trans2; + trans2(context, in, out, axis); + break; + case 3: + paddle::lite::x86::math::Transpose trans3; + trans3(context, in, out, axis); + break; + case 4: + paddle::lite::x86::math::Transpose trans4; + trans4(context, in, out, axis); + break; + default: + CHECK(0) << ("Unsupport dim in mlu layout"); + } +} + +template +class LayoutNchwToNhwcCompute + : public KernelLite { + public: + using param_t = operators::LayoutParam; + + void Run() override { + auto& param = this->template Param(); + auto* x = param.x; + auto* out = param.y; + out->template mutable_data::T>(); + auto x_dims = param.x->dims().size(); + auto& context = this->ctx_->template As(); + + const auto origin_dims = out->dims().Vectorize(); + + std::vector axis; + switch (x_dims) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + out->Resize(std::vector{ + out->dims()[0], out->dims()[2], out->dims()[1]}); + break; + case 4: + axis = {0, 2, 3, 1}; + out->Resize(std::vector{ + out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + break; + default: + CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc"; + } + + LayoutTransCompute::T>( + x_dims, context, *x, out, axis); + + if (x_dims > 2) { + out->Resize(origin_dims); + } + } + + std::string doc() const override { + return "Mlu layout transform nchw to nhwc"; + } +}; + +template +class LayoutNhwcToNchwCompute + : public KernelLite { + public: + using param_t = operators::LayoutParam; + + void Run() override { + auto& param = this->template Param(); + auto* x = param.x; + auto* out = param.y; + out->template mutable_data::T>(); + auto x_dims = param.x->dims().size(); + auto& context = this->ctx_->template As(); + + const auto origin_dims = out->dims().Vectorize(); + + std::vector axis; + switch (x_dims) { + case 2: + axis = {0, 1}; + break; + case 3: + out->Resize(std::vector{ + out->dims()[0], out->dims()[2], out->dims()[1]}); + axis = {0, 2, 1}; + break; + case 4: + out->Resize(std::vector{ + out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]}); + axis = {0, 3, 1, 2}; + break; + default: + CHECK(0) << "Unsupport dim in mlu layout nhwc to nchw"; + } + + LayoutTransCompute::T>( + x_dims, context, *x, out, axis); + + if (x_dims > 2) { + out->Resize(origin_dims); + } + } + + std::string doc() const override { + return "Mlu layout transform nhwc to nchw"; + } +}; + +} // namespace mlu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 06fc791fe7d07ba759e2ed0f9c6187432e195186..3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -46,6 
+46,32 @@ class SubgraphEngine : public subgraph::Engine { graph_.SetFPType(type); } + int Build() { + // In order to attach all of the ops of the block desc, we need to build + // the original program firstly. + BuildOriginProgram(); + // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph + build_device_program_status_ = BuildDeviceProgram(); + return build_device_program_status_; + } + + int Launch() { + // Rebuild device program when the shapes of input tensors have been + // changed. + if (subgraph::CHECK_SUCCESS(build_device_program_status_) && + subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( + build_device_program_status_) && + InputShapeChanged()) { + Build(); + } + if (subgraph::CHECK_FAILED(build_device_program_status_)) { + LaunchOriginProgram(); + } else { + LaunchDeviceProgram(); + } + return 0; + } + protected: int BuildDeviceProgram() override { int status = 0; @@ -57,7 +83,7 @@ class SubgraphEngine : public subgraph::Engine { graph_.AddNode(input_name, input_tensor->dims().Vectorize(), CNML_TENSOR, - CNML_NHWC, + CNML_NCHW, graph_.FPType(), const_cast(input_tensor->raw_data())); CHECK(input_node); @@ -71,9 +97,9 @@ class SubgraphEngine : public subgraph::Engine { for (auto& inst : origin_program_) { auto op = inst.op(); CHECK(op); - op->CheckShape(); - op->InferShape(); std::string op_type = op->op_info()->Type(); + op->CheckShape(); + const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; return subgraph::FAILED; @@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine { graph_.AddInput(graph_.GetNode(input_name)); } CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; - // auto& mlu_context = this->ctx_->template As(); - // auto core_version = mlu_context.MLUCoreVersion(); - // auto core_number = mlu_context.MLUCoreNumber(); - // graph_.Compile(core_version, core_number); + auto& mlu_context = this->ctx_->template As(); + auto core_version = mlu_context.MLUCoreVersion(); + auto core_number = mlu_context.MLUCoreNumber(); + graph_.Compile(core_version, core_number); return status; } int LaunchDeviceProgram() override { - // auto& mlu_context = this->ctx_->template As(); - // auto exec_queue = mlu_context.exec_queue(); - // u32_t affinity = mlu_context.affinity(); - // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - // int data_param = 1; - // forward_param.data_parallelism = &data_param; - // forward_param.affinity = &affinity; - // forward_param.end = CNRT_PARAM_END; - // graph_.Compute(forward_param, exec_queue); + auto& mlu_context = this->ctx_->template As(); + auto exec_queue = mlu_context.exec_queue(); + u32_t affinity = mlu_context.affinity(); + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph_.Compute(forward_param, exec_queue); return 0; } diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index bcf6ba63eb820ee187dd26b2722686a768f78c98..e53bd60c6bade98992524fe0959e2f80f535a6be 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) return() endif() diff --git a/lite/kernels/opencl/CMakeLists.txt 
b/lite/kernels/opencl/CMakeLists.txt index 8b1a1f8d3d950840ce8fadef70150c452b54c186..d9fae3d48efb1eab2681338b02afa2fee65750b6 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -33,7 +33,7 @@ add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kerne add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) - +add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps}) # extra # wait to add ... @@ -97,6 +97,10 @@ lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc DEPS pad2d_opencl layout_opencl op_registry program context) + +lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc + DEPS box_coder_opencl op_registry program context) + ###################### # buffer kernel # ###################### diff --git a/lite/kernels/opencl/activation_buffer_compute.cc b/lite/kernels/opencl/activation_buffer_compute.cc index c662aa89fb257aded70119ea14494111398f0529..03ccdac99e5f11e1c056374463f7a8068dbd4f56 100644 --- a/lite/kernels/opencl/activation_buffer_compute.cc +++ b/lite/kernels/opencl/activation_buffer_compute.cc @@ -32,8 +32,10 @@ class ReluCompute std::string doc() const override { return "Relu using cl::Buffer, kFloat"; } void PrepareForRun() override { auto& context = ctx_->As<OpenCLContext>(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/relu_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/relu_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -46,7 +48,7 @@ class ReluCompute auto* x_buf = param.X->data(); auto* out_buf = param.Out->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); @@ -74,6 +76,7 @@ class ReluCompute private: std::string kernel_func_name_{"relu"}; std::string build_options_{"-DCL_DTYPE_float -DRELU"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr<cl::Event> event_{new cl::Event}; }; @@ -87,8 +90,10 @@ class SigmoidCompute } void PrepareForRun() override { auto& context = ctx_->As<OpenCLContext>(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/sigmoid_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -101,7 +106,7 @@ class SigmoidCompute auto* x_buf = param.X->data(); auto* out_buf = param.Out->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); @@ -129,6 +134,7 @@ class SigmoidCompute private: std::string kernel_func_name_{"sigmoid"}; std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr<cl::Event> event_{new cl::Event}; }; diff
--git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc index dbe487ba91d00c2de4c08edf140526d727bac6b5..a99e588eccd79eb35a5e7c0f3da73471849ab581 100644 --- a/lite/kernels/opencl/activation_image_compute.cc +++ b/lite/kernels/opencl/activation_image_compute.cc @@ -37,11 +37,12 @@ class ActivationComputeImageDefault } void PrepareForRun() override { - auto& context = ctx_->As(); act_param_ = param_.get_mutable(); int act_type = static_cast(act_param_->active_type); +#ifndef LITE_SHUTDOWN_LOG VLOG(1) << "ActivationTypeToStr(act_param_->active_type):" << ActivationTypeToStr(act_param_->active_type); +#endif switch (act_type) { case 1: kernel_func_name_ = "relu"; @@ -71,41 +72,70 @@ class ActivationComputeImageDefault LOG(FATAL) << "This act type:" << act_type << " doesn't support."; return; } +#ifndef LITE_SHUTDOWN_LOG VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/activation_kernel.cl", build_options_); - } - - void Run() override { - auto& param = *param_.get_mutable(); - const auto& x_dims = param.X->dims(); - auto* x_img = param.X->data(); - auto image_shape = InitImageDimInfoWith(x_dims); - auto* out_img = param.Out->mutable_data( - image_shape["width"], image_shape["height"]); - const auto& y_dims = param.Out->dims(); // useless: check dim only +#endif auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); + context.cl_context()->AddKernel(kernel_func_name_, + "image/activation_kernel.cl", + build_options_, + time_stamp_); + STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + } - int arg_idx = 0; - cl_int status = kernel.setArg(arg_idx, *x_img); + void ReInitWhenNeeded() override { + act_param_ = param_.get_mutable(); + auto x_dims = act_param_->X->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + x_img_shape_ = default_convertor.InitImageDimInfoWith( + act_param_->X->dims()); // w, h + out_img_shape_ = default_convertor.InitImageDimInfoWith( + act_param_->Out->dims()); // w, h + + // compute global work size + GetGlobalWorkSize(); + } + } + + void GetGlobalWorkSize() { + global_work_size_ = + cl::NDRange{static_cast(x_img_shape_[0]), + static_cast(x_img_shape_[1])}; + } + + void Run() override { + auto* x_img = act_param_->X->data(); + auto* out_img = act_param_->Out->mutable_data( + out_img_shape_[0], out_img_shape_[1]); + + auto kernel = kernel_; + cl_int status; + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(1, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, threshold_); + status = kernel.setArg(2, threshold_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, scale_); + status = kernel.setArg(3, scale_); CL_CHECK_FATAL(status); #ifndef LITE_SHUTDOWN_LOG - VLOG(4) << TargetToStr(param.X->target()); - VLOG(4) << TargetToStr(param.Out->target()); - VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " - << image_shape["height"]; + const auto& x_dims = act_param_->X->dims(); + const auto& y_dims = 
act_param_->Out->dims(); // useless: check dim only + VLOG(4) << TargetToStr(act_param_->X->target()); + VLOG(4) << TargetToStr(act_param_->Out->target()); + VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " " + << x_img_shape_[1]; VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " @@ -115,13 +145,12 @@ class ActivationComputeImageDefault VLOG(4) << "kernel func name:" << kernel_func_name_; #endif - auto global_work_size = - cl::NDRange{static_cast(image_shape["width"]), - static_cast(image_shape["height"])}; + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -131,10 +160,20 @@ class ActivationComputeImageDefault private: param_t* act_param_{nullptr}; + DDim x_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim last_x_dims_; std::string kernel_func_name_{}; float threshold_{6.f}; float scale_{1.f}; + cl::Kernel kernel_; + bool first_epoch_for_reinit_{true}; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; } // namespace opencl diff --git a/lite/kernels/opencl/bilinear_interp_image_compute.cc b/lite/kernels/opencl/bilinear_interp_image_compute.cc index 7e32010c0b5ff5cedad8b0da7ce7233fbf73da6f..53f260789e12a94dc39f785df12a8e988d08bcbe 100644 --- a/lite/kernels/opencl/bilinear_interp_image_compute.cc +++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc @@ -43,8 +43,10 @@ class BilinearInterpImageCompute bilinear_interp_param_ = param_.get_mutable(); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/bilinear_interp_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/bilinear_interp_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -103,7 +105,7 @@ class BilinearInterpImageCompute #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; @@ -159,6 +161,7 @@ class BilinearInterpImageCompute param_t* bilinear_interp_param_{nullptr}; std::string kernel_func_name_{"bilinear_interp"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/box_coder_image_compute.cc b/lite/kernels/opencl/box_coder_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..81ad858df0834f58b84b55ef594d71442a27f186 --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { +class BoxCoderComputeImage : public KernelLite { + public: + using param_t = operators::BoxCoderParam; + + void PrepareForRun() override { + auto& context = ctx_->As(); + boxcoder_param_ = param_.get_mutable(); + if (boxcoder_param_->code_type == "decode_center_size" && + boxcoder_param_->box_normalized == true) { + kernel_func_name_ = "decode_center_size"; + } else { + printf("This code_type %s doesn't support \n", + boxcoder_param_->code_type.c_str()); + return; + } + CHECK(context.cl_context() != nullptr); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel( + kernel_func_name_, "image/box_coder_kernel.cl", build_options_); + } + + void Run() override { + boxcoder_param_ = param_.get_mutable(); + const auto& out_dims = boxcoder_param_->proposals->dims(); + auto image_shape = InitImageDimInfoWith(out_dims); + + auto* out_buf = + boxcoder_param_->proposals->mutable_data( + image_shape["width"], image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "boxcoder input shape: "; + +#endif + const auto* input_priorbox = boxcoder_param_->prior_box; + const auto* input_priorboxvar = boxcoder_param_->prior_box_var; + const auto* input_targetbox = boxcoder_param_->target_box; + const auto& code_type = boxcoder_param_->code_type; + if (code_type == "decode_center_size") { + auto* prior_box_image = input_priorbox->data(); + auto* prior_box_var_image = + input_priorboxvar->data(); + auto* target_box_image = input_targetbox->data(); + + int new_dims[4] = {1, 1, 1, 1}; + for (int i = 0; i < out_dims.size(); i++) { + new_dims[4 - out_dims.size() + i] = out_dims[i]; + } + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + auto default_work_size = + DefaultWorkSize(out_dims, + DDim(std::vector{ + static_cast(image_shape["width"]), + static_cast(image_shape["height"])})); + + int out_C = new_dims[1]; + int out_H = new_dims[2]; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << TargetToStr(boxcoder_param_->proposals->target()); + VLOG(4) << "output shape: " << out_dims[0] << ", " << out_dims[1] << ", " + << out_dims[2] << ", " << out_dims[3]; + VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " + << image_shape["height"]; + VLOG(4) << "out_C = " << out_C; + VLOG(4) << "out_H = " << out_H; + VLOG(4) << "default_work_size = " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + int arg_idx = 0; + cl_int status = kernel.setArg(arg_idx++, *prior_box_image); + 
CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *prior_box_var_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *target_box_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, *out_buf); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(arg_idx++, out_H); + CL_CHECK_FATAL(status); + auto global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[2])}; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_buf, event_); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " + << global_work_size[1]; +#endif + } + } + std::string doc() { return "Boxcoder using cl::Image, kFP16"; } + + param_t* boxcoder_param_{nullptr}; + std::string kernel_func_name_{}; + std::string build_options_{" -DCL_DTYPE_half"}; + std::shared_ptr event_{new cl::Event}; +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle +typedef paddle::lite::kernels::opencl::BoxCoderComputeImage BoxCoder_image; + +REGISTER_LITE_KERNEL( + box_coder, kOpenCL, kFP16, kImageDefault, BoxCoder_image, ImageDefault) + .BindInput("PriorBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("PriorBoxVar", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindInput("TargetBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("OutputBox", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); diff --git a/lite/kernels/opencl/box_coder_image_compute_test.cc b/lite/kernels/opencl/box_coder_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab37a8b015a80c0389bd6f62bb07c70c0d14a74a --- /dev/null +++ b/lite/kernels/opencl/box_coder_image_compute_test.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" + +#define FP16_MAX_DIFF (5e-1) +namespace paddle { +namespace lite { +void box_coder_ref(float* proposals_data, + const float* anchors_data, + const float* bbox_deltas_data, + const float* variances_data, + int axis, + bool box_normalized, + std::string code_type, + int row, + int col) { + if (code_type == "decode_center_size") { + int anchor_len = 4; + int out_len = 4; + int var_len = 4; + int delta_len = 4; + float normalized = !box_normalized ? 
1.f : 0; + + for (int64_t row_id = 0; row_id < row; ++row_id) { + for (int64_t col_id = 0; col_id < col; ++col_id) { + size_t delta_offset = row_id * col * delta_len + col_id * delta_len; + size_t out_offset = row_id * col * out_len + col_id * out_len; + int prior_box_offset = + axis == 0 ? col_id * anchor_len : row_id * anchor_len; + int var_offset = axis == 0 ? col_id * var_len : row_id * var_len; + auto anchor_data_tmp = anchors_data + prior_box_offset; + auto bbox_deltas_data_tmp = bbox_deltas_data + delta_offset; + auto proposals_data_tmp = proposals_data + out_offset; + auto anchor_width = + anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; + auto anchor_height = + anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; + auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; + auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; + float bbox_center_x = 0, bbox_center_y = 0; + float bbox_width = 0, bbox_height = 0; + + auto variances_data_tmp = variances_data + var_offset; + bbox_center_x = + variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + + anchor_center_x; + bbox_center_y = + variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) * + anchor_width; + bbox_height = + std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) * + anchor_height; + proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; + proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; + proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; + proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; + } + } + } else if (code_type == "encode_center_size") { + LOG(FATAL) << "not implemented type: " << code_type; + } else { + LOG(FATAL) << "not supported type: " << code_type; + } +} +// #define BOXCODER_FP16_LOOP_TEST +// #define BOXCODER_FP16_PRINT_RESULT +TEST(box_coder_image2d, compute) { +#ifdef BOXCODER_FP16_LOOP_TEST + for (auto n : {1, 2, 3, 4}) { + for (auto m : {1, 3, 4, 8}) { + for (auto norm : {true}) { + for (auto code_type : {"decode_center_size"}) { + for (auto axis : {0}) { +#else + const int n = 1; + const int m = 1; + const bool norm = true; + const std::string code_type = "decode_center_size"; + const int axis = 0; +#endif // BOXCODER_FP16_LOOP_TEST + + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << m + << " ========"; + LOG(INFO) << "======== parameters: norm = " << norm + << ", axis = " << axis << "code_type: " << code_type; + + auto kernels = + KernelRegistry::Global().Create("box_coder", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + LOG(INFO) << "get kernel:" << kernel->doc(); + + lite::Tensor prior_box, prior_box_var, target_box, output_box; + operators::BoxCoderParam param; + param.prior_box = &prior_box; + param.prior_box_var = &prior_box_var; + param.target_box = &target_box; + param.proposals = &output_box; + param.axis = axis; + param.box_normalized = norm; + param.code_type = code_type; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr boxcoder_context(new KernelContext); + context->As().CopySharedTo( + &(boxcoder_context->As())); + kernel->SetContext(std::move(boxcoder_context)); + + const DDim prior_box_dims = + DDim(std::vector{1, 1, m, 4}); + const DDim prior_box_var_dims = + DDim(std::vector{1, 1, m, 4}); + const DDim 
target_box_dims = + DDim(std::vector{1, n, m, 4}); + const DDim out_dim = + DDim(std::vector{1, n, m, 4}); + prior_box.Resize(prior_box_dims); + prior_box_var.Resize(prior_box_var_dims); + target_box.Resize(target_box_dims); + output_box.Resize(out_dim); + + std::vector prior_box_data(prior_box_dims.production()); + std::vector prior_box_var_data( + prior_box_var_dims.production()); + std::vector target_box_data(target_box_dims.production()); + for (int i = 0; i < prior_box_dims.production(); i++) { + prior_box_data[i] = i * 1.1 / prior_box_dims.production(); + } + for (int i = 0; i < prior_box_var_dims.production(); i++) { + prior_box_var_data[i] = i * 1.2 / prior_box_var_dims.production(); + } + for (int i = 0; i < target_box_dims.production(); i++) { + target_box_data[i] = i * 1.3 / target_box_dims.production(); + } + + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = + new CLImageConverterDefault(); + DDim prior_box_image_shape = + default_converter->InitImageDimInfoWith(prior_box_dims); + LOG(INFO) << "prior_box_image_shape = " << prior_box_image_shape[0] + << " " << prior_box_image_shape[1]; + std::vector prior_box_image_data( + prior_box_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(prior_box_data.data(), + prior_box_image_data.data(), + prior_box_dims); + auto* prior_box_image = prior_box.mutable_data( + prior_box_image_shape[0], + prior_box_image_shape[1], + prior_box_image_data.data()); + + DDim prior_box_var_image_shape = + default_converter->InitImageDimInfoWith(prior_box_var_dims); + LOG(INFO) << "prior_box_var_image_shape = " + << prior_box_var_image_shape[0] << " " + << prior_box_var_image_shape[1]; + std::vector prior_box_var_image_data( + prior_box_var_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(prior_box_var_data.data(), + prior_box_var_image_data.data(), + prior_box_var_dims); + auto* prior_box_var_image = + prior_box_var.mutable_data( + prior_box_var_image_shape[0], + prior_box_var_image_shape[1], + prior_box_var_image_data.data()); + + DDim target_box_image_shape = + default_converter->InitImageDimInfoWith(target_box_dims); + LOG(INFO) << "target_box_image_shape = " + << target_box_image_shape[0] << " " + << target_box_image_shape[1]; + std::vector target_box_image_data( + target_box_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(target_box_data.data(), + target_box_image_data.data(), + target_box_dims); + auto* target_box_image = + target_box.mutable_data( + target_box_image_shape[0], + target_box_image_shape[1], + target_box_image_data.data()); + + DDim out_image_shape = + default_converter->InitImageDimInfoWith(out_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output_box.mutable_data( + out_image_shape[0], out_image_shape[1]); + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.proposals->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl " + "tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the " + "target cl tensor."; + } + + lite::Tensor out_ref_tensor; + out_ref_tensor.Resize(out_dim); + box_coder_ref(out_ref_tensor.mutable_data(), + prior_box_data.data(), + target_box_data.data(), + prior_box_var_data.data(), + axis, + norm, + code_type, + target_box_dims[0], + target_box_dims[1]); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + half_t* out_image_data = + new half_t[40000]; // [out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + out_image, + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data, out_data, out_image_shape, out_dim); +// result +#ifdef BOXCODER_FP16_PRINT_RESULT + LOG(INFO) << "---- print kernel result (input -> output) ----"; + for (int eidx = 0; eidx < out_dim.production(); ++eidx) { + std::cout << target_box_data[eidx] << " -> " << out_data[eidx] + << std::endl; + } +#endif // BOXCODER_FP16_PRINT_RESULT + const float* out_ref = out_ref_tensor.data(); + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(out_data[i] - out_ref[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]); + EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || + (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && + (abs_diff > FP16_MAX_DIFF)) { + LOG(ERROR) << "error idx:" << i << ", in_data[" << i + << "]: " << target_box_data[i] << ", out_data[" << i + << "]: " << out_data[i] << ", out_ref[" << i + << "]: " << out_ref[i] << ", abs_diff: " << abs_diff + << ", relative_diff: " << relative_diff + << ", FP16_MAX_DIFF: " << FP16_MAX_DIFF; + } + } +#ifdef BOXCODER_FP16_LOOP_TEST + } // axis + } // code_type + } // norm + } // m + } // n +#else +// nothing to do. 
+#endif +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(box_coder, kOpenCL, kFP16, kImageDefault, ImageDefault); diff --git a/lite/kernels/opencl/concat_buffer_compute.cc b/lite/kernels/opencl/concat_buffer_compute.cc index 010e7726170ab1f40adc2fcb56a66835ac7d2bd2..414f62ff0c4f86f29756b933817de2a7682ecd4c 100644 --- a/lite/kernels/opencl/concat_buffer_compute.cc +++ b/lite/kernels/opencl/concat_buffer_compute.cc @@ -38,8 +38,10 @@ class ConcatCompute : public KernelLiteAddKernel( - kernel_func_name_, "buffer/concat_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/concat_kernel.cl", + build_options_, + time_stamp_); auto axis = concat_param_->axis; auto inputs = concat_param_->x; @@ -88,7 +90,7 @@ class ConcatCompute : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto inputs = param.x; int arg_idx = 0; @@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index 95e64025662a4b87cd68c211ccc0b0fb7b84a9f2..60d1ac628ab1474d7e82f1861067bca838548569 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLiteAddKernel( - kernel_func_name_, "image/concat_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/concat_kernel.cl", + build_options_, + time_stamp_); auto axis = concat_param_->axis; auto inputs = concat_param_->x; @@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto inputs = param.x; int arg_idx = 0; @@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/conv_buffer_compute.cc b/lite/kernels/opencl/conv_buffer_compute.cc index 65477e89c7d00408bf4d639138dea936a61a3d70..4c118e1263c0d3c23eb223b01b98a8d9a53bac0e 100644 --- a/lite/kernels/opencl/conv_buffer_compute.cc +++ b/lite/kernels/opencl/conv_buffer_compute.cc @@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() { } for (size_t i = 0; i < kernel_func_names_.size(); i++) { - context.cl_context()->AddKernel( - kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); + context.cl_context()->AddKernel(kernel_func_names_[i], + kernel_func_paths_[i], + build_options_[i], + time_stamp_); } } @@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() { auto& context = ctx_->As(); std::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; + kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_; auto img2col_kernel = context.cl_context()->GetKernel(kernel_key.str()); int n_threads = c_in * h_out * w_out; @@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() { int n = h_out * w_out; VLOG(4) << "m = " << m << " n = " << n << " k = " << k; kernel_key.str(""); - kernel_key << kernel_func_names_[1] << build_options_[1]; + kernel_key << kernel_func_names_[1] << build_options_[1] << time_stamp_; auto gemm_kernel = context.cl_context()->GetKernel(kernel_key.str()); GemmBatched( gemm_kernel, col_buf, filter_buf, 
bias_buf, output_buf, bs, m, n, k); @@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() { auto& context = ctx_->As(); std::stringstream kernel_key; - kernel_key << kernel_func_names_.front() << build_options_.front(); + kernel_key << kernel_func_names_.front() << build_options_.front() + << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k); diff --git a/lite/kernels/opencl/conv_buffer_compute.h b/lite/kernels/opencl/conv_buffer_compute.h index 44ada55d92352edf3c64cd653e832b26718cdd2f..3dabe906f128ef96fb03dfa82ab3847febaeeed5 100644 --- a/lite/kernels/opencl/conv_buffer_compute.h +++ b/lite/kernels/opencl/conv_buffer_compute.h @@ -21,6 +21,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/tensor.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" namespace paddle { @@ -55,6 +56,7 @@ class ConvCompute std::vector kernel_func_names_{}; std::vector kernel_func_paths_{}; std::vector build_options_{}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 40ffd697c2ac66223430441bae9800d06a743387..aadd7010cca2ec03ea417e3b486d8c946d80fcab 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() { build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { - context.cl_context()->AddKernel( - kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); + context.cl_context()->AddKernel(kernel_func_names_[i], + kernel_func_paths_[i], + build_options_[i], + time_stamp_); } VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; std::stringstream kernel_key; - kernel_key << kernel_func_names_[0] << build_options_[0]; + kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_; kernel_ = context.cl_context()->GetKernel(kernel_key.str()); VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 31a14a0c5b94f357e78df8eb35f6823ec6f57998..6f293a0d7dd90e55bedd63c214ba38799a591080 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -22,6 +22,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/tensor.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" namespace paddle { @@ -56,6 +57,7 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; std::vector kernel_func_paths_{}; std::vector build_options_{}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; Tensor filter_gpu_image_; Tensor bias_gpu_image_; diff --git a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc index 0c88509926041411eddac66bea08b5d3a08d6a3c..afe2aa1c66c04d2bdf180a77362e5d6f1271c1f6 100644 --- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc @@ -44,8 +44,10 @@ class DepthwiseConv2dCompute build_options_ += " -DRELU6"; } auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/depthwise_conv2d_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/depthwise_conv2d_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -67,7 +69,7 @@ class DepthwiseConv2dCompute param.output->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -120,6 +122,7 @@ class DepthwiseConv2dCompute private: std::string kernel_func_name_{"depthwise_conv2d"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/dropout_image_compute.cc b/lite/kernels/opencl/dropout_image_compute.cc index 490e34a8868a3f625591a1c621aa297bb0639576..2be5af2ef0bf3e30d1c586d57ed6c3d40d625b14 100644 --- a/lite/kernels/opencl/dropout_image_compute.cc +++ b/lite/kernels/opencl/dropout_image_compute.cc @@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/dropout_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/dropout_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.cc b/lite/kernels/opencl/elementwise_add_buffer_compute.cc index 3961ac7583917fdcd761614558c493e6917d3294..b70f7d1ee017566e399ac86d35df56bd4ba4d383 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc @@ -25,8 +25,10 @@ 
namespace opencl { void ElementwiseAddCompute::PrepareForRun() { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); UpdateParams(); } @@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() { auto* out_buf = ele_param_->Out->template mutable_data( TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); #ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(ele_param_->X->target()); diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.h b/lite/kernels/opencl/elementwise_add_buffer_compute.h index 5a9266ee69b81416d5f4dea9a3eb38aaed7b4165..7dbe5d0e8d5172386418d547812bf4e6c269f043 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.h +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.h @@ -16,6 +16,7 @@ #include #include #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -46,6 +47,7 @@ class ElementwiseAddCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_add"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index 6d0ebf638f0a8967e27a657131e1cac89967ee0b..51d488d51b72dd9af8225b45a7ee56063312d055 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -23,44 +23,84 @@ namespace lite { namespace kernels { namespace opencl { -void ElementwiseAddImageCompute::PrepareForRun() { - ele_param_ = param_.get_mutable(); - auto* x = ele_param_->X; - auto* y = ele_param_->Y; - auto axis = ele_param_->axis; +void ElementwiseAddImageCompute::PrepareForRun() {} - if (y->dims().size() == 4) { - kernel_func_name_ = "elementwise_add"; // y: ImageDefault - } else if (y->dims().size() == 1) { - if (axis == x->dims().size() - 1) { - kernel_func_name_ = "width_add"; // y: ImageDefault - } else if (axis == x->dims().size() - 3) { - kernel_func_name_ = "channel_add"; // y: ImageFolder +void ElementwiseAddImageCompute::ReInitWhenNeeded() { + ele_param_ = param_.get_mutable(); + auto x_dims = ele_param_->X->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // choose kernel + auto* x = ele_param_->X; + auto* y = ele_param_->Y; + auto* out = ele_param_->Out; + auto axis = ele_param_->axis; + + if (y->dims().size() == 4) { + kernel_func_name_ = "elementwise_add"; // y: ImageDefault + } else if (y->dims().size() == 1) { + if (axis == x->dims().size() - 1) { + kernel_func_name_ = "width_add"; // y: ImageDefault + } else if (axis == x->dims().size() - 3) { + kernel_func_name_ = "channel_add"; // y: ImageFolder + } else { + LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis + << ", x->dims().size():" << x->dims().size() + << ", y->dims.size():" << y->dims().size(); + } } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis << ", 
x->dims().size():" << x->dims().size() << ", y->dims.size():" << y->dims().size(); } - } else { - LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + + auto& context = ctx_->As(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_add_kernel.cl", + build_options_, + time_stamp_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims()); // w, h + y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims()); + out_img_shape_ = + default_convertor.InitImageDimInfoWith(out->dims()); // w, h + + // compute global work size + GetGlobalWorkSize(); } - VLOG(1) << "kernel_func_name_:" << kernel_func_name_; +} - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); +void ElementwiseAddImageCompute::GetGlobalWorkSize() { + global_work_size_ = cl::NDRange{static_cast(x_img_shape_[0]), + static_cast(x_img_shape_[1])}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " " + << x_img_shape_[1]; +#endif } void ElementwiseAddImageCompute::Run() { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x = ele_param_->X; auto* y = ele_param_->Y; auto* out = ele_param_->Out; auto axis = ele_param_->axis; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + auto* x_img = x->data(); + auto* y_img = y->data(); + auto* out_img = out->mutable_data(out_img_shape_[0], + out_img_shape_[1]); #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); @@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; -#endif - - paddle::lite::CLImageConverterDefault default_convertor; - auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h - auto x_img_width = x_img_shape[0]; - auto x_img_height = x_img_shape[1]; - auto out_img_shape = - default_convertor.InitImageDimInfoWith(out->dims()); // w, h - auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims()); - auto* x_img = x->data(); - auto* y_img = y->data(); - auto* out_img = out->mutable_data(out_img_shape[0], - out_img_shape[1]); - -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; - VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; - VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " - << out_img_shape[1]; + VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1]; + VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1]; + VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " " + << out_img_shape_[1]; #endif - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - int arg_idx = 0; - auto y_dims = y->dims(); + cl_int status; + auto kernel = kernel_; if (y_dims.size() == 4) { - cl_int status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status 
= kernel.setArg(++arg_idx, *y_img); + status = kernel.setArg(1, *y_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); } else if (y_dims.size() == 1) { - if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { - int tensor_w = x->dims()[x->dims().size() - 1]; + if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) { + const int tensor_w = x_dims[x_dims.size() - 1]; #ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; #endif - cl_int status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *y_img); + status = kernel.setArg(1, *y_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(tensor_w)); + status = kernel.setArg(3, tensor_w); CL_CHECK_FATAL(status); } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + << ", x->dims().size():" << x_dims.size() + << ", y->dims.size():" << y_dims.size(); } } else { LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis - << ", x->dims().size():" << x->dims().size() - << ", y->dims.size():" << y->dims().size(); + << ", x->dims().size():" << x_dims.size() + << ", y->dims.size():" << y_dims.size(); } - auto global_work_size = cl::NDRange{static_cast(x_img_width), - static_cast(x_img_height)}; -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; -#endif - auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); diff --git a/lite/kernels/opencl/elementwise_add_image_compute.h b/lite/kernels/opencl/elementwise_add_image_compute.h index 084f0fe7fb18f9abe3c6ef41f10a9e38e31a54fc..a92a1b448176628381a3c65b838f6bba529eb4e0 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.h +++ b/lite/kernels/opencl/elementwise_add_image_compute.h @@ -15,8 +15,10 @@ #include #include +#include #include "lite/backends/opencl/cl_half.h" #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -34,6 +36,10 @@ class ElementwiseAddImageCompute void PrepareForRun() override; + void ReInitWhenNeeded() override; + + void GetGlobalWorkSize(); + void Run() override; std::string doc() const override { @@ -42,8 +48,21 @@ class ElementwiseAddImageCompute protected: param_t* ele_param_{nullptr}; + DDim last_x_dims_; + DDim x_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim y_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + std::string kernel_func_name_{"elementwise_add"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; + bool first_epoch_for_reinit_{true}; + cl::Kernel kernel_; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::shared_ptr event_{new cl::Event}; }; diff --git 
a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index aa6af2a29bfdedfb5fdd3114693514b6fad13a64..96dc2de1affba7c36be6c9c0e952b85be726fca8 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -71,8 +71,10 @@ class ElementwiseMulImageCompute VLOG(4) << "bias_dims.size():" << bias_dims.size(); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_mul_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -114,7 +116,7 @@ class ElementwiseMulImageCompute #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto bias_dims = y->dims(); @@ -201,6 +203,7 @@ class ElementwiseMulImageCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_mul"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 0bc867d7f124582660b7a0a9a95d026d910fc2d3..b93167b99c064a2f9eb2256291adad99f3912baf 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() { VLOG(1) << "kernel_func_name_:" << kernel_func_name_; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_sub_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_sub_kernel.cl", + build_options_, + time_stamp_); } void ElementwiseSubImageCompute::Run() { @@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() { #endif STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.h b/lite/kernels/opencl/elementwise_sub_image_compute.h index 48386b083e5375f8943c04afb1da70a2ed207dbf..db3e1db9813bffd985a41abbac14e5c89e574397 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.h +++ b/lite/kernels/opencl/elementwise_sub_image_compute.h @@ -17,6 +17,7 @@ #include #include "lite/backends/opencl/cl_half.h" #include "lite/core/kernel.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" @@ -44,6 +45,7 @@ class ElementwiseSubImageCompute param_t* ele_param_{nullptr}; std::string kernel_func_name_{"elementwise_sub"}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index dbdedd136ea6b8c6b06d02d4f6d893e4ea849e8a..0fb83db2fe76e27baf7a096395369cb92b995072 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include 
"lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" @@ -30,74 +31,98 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override { - const auto& param = *param_.get_mutable(); - const auto x_dims = param.input->dims(); - const auto w_dims = param.w->dims(); - - CHECK_GE(x_dims.size(), 2UL); - CHECK_GE(w_dims.size(), 2UL); - CHECK_EQ(param.output->dims().size(), 2UL); - - m_ = x_dims.Slice(0, param.in_num_col_dims).production(); - k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); - n_ = w_dims[1]; - CHECK_EQ(k_, static_cast(w_dims[0])); - VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2] - << " " << w_dims[3]; - VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_; + void PrepareForRun() override {} + void ReInitWhenNeeded() override { + fc_param_ = param_.get_mutable(); + const auto x_dims = fc_param_->input->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute m,n,k + const auto w_dims = fc_param_->w->dims(); + CHECK_GE(x_dims.size(), 2UL); + CHECK_GE(w_dims.size(), 2UL); + CHECK_EQ(fc_param_->output->dims().size(), 2UL); + + m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production(); + k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production(); + n_ = w_dims[1]; + CHECK_EQ(k_, static_cast(w_dims[0])); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] + << " " << x_dims[3]; + VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2] + << " " << w_dims[3]; + VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_; +#endif + + // choose kernel + if (m_ == 1) { // gemv + kernel_func_name_ = "fc_gemv_1x4"; + } else { // gemm + kernel_func_name_ = "fc_gemm_4x4"; + } +#ifndef LITE_SHUTDOWN_LOG + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; +#endif + + if (fc_param_->activation_type == "relu") { + build_options_ += "-DRELU"; + } + + auto& context = ctx_->As(); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/fc_kernel.cl", + build_options_, + time_stamp_); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + + // compute global work size + GetGlobalWorkSize(); + } + } + + void GetGlobalWorkSize() { if (m_ == 1) { // gemv - kernel_func_name_ = "fc_gemv_1x4"; global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm - kernel_func_name_ = "fc_gemm_4x4"; global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), static_cast((n_ + 3) / 4)}; } - VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - - if (param.activation_type == "relu") { - build_options_ += "-DRELU"; - } - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/fc_kernel.cl", build_options_); } void Run() override { - const auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x_buf = param.input->data(); - auto* w_buf = param.w->data(); - auto* bias_buf = param.bias->data(); + auto* x_buf = fc_param_->input->data(); + auto* w_buf = fc_param_->w->data(); + auto* 
bias_buf = fc_param_->bias->data(); auto* out_buf = - param.output->mutable_data(TARGET(kOpenCL)); - - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + fc_param_->output->mutable_data(TARGET(kOpenCL)); + auto kernel = kernel_; cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, *x_buf); + status = kernel.setArg(0, *x_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *w_buf); + status = kernel.setArg(1, *w_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *bias_buf); + status = kernel.setArg(2, *bias_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); + status = kernel.setArg(3, *out_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(m_)); + status = kernel.setArg(4, static_cast(m_)); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(n_)); + status = kernel.setArg(5, static_cast(n_)); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(k_)); + status = kernel.setArg(6, static_cast(k_)); CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, @@ -111,9 +136,14 @@ class FcCompute private: int m_, n_, k_; + param_t* fc_param_{nullptr}; std::string kernel_func_name_{}; std::string build_options_{"-DCL_DTYPE_float "}; + std::string time_stamp_{GetTimeStamp()}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; cl::NDRange global_work_size_; + cl::Kernel kernel_; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc b/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc index d76e00fa85d4ebb6da9d779e9c2b220a2fd731d9..730b70525e818512aea11e1f42c1282b125aae54 100644 --- a/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc +++ b/lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc @@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute { void PrepareForRun() override { build_options_ += " -DRELU"; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); UpdateParams(); auto act_t = static_cast(ele_param_)->act_type; diff --git a/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc b/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc index e5c0e29bddf5cd6c25ccf98f05aa7cb091a4be7e..8e687340943dcb0f1b68e4c9495cbab1ad703645 100644 --- a/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc +++ b/lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/op_registry.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h" +#include "lite/kernels/opencl/image_helper.h" namespace paddle { namespace lite { @@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute void PrepareForRun() override { build_options_ += " -DRELU"; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); + 
context.cl_context()->AddKernel(kernel_func_name_, + "image/elementwise_add_kernel.cl", + build_options_, + time_stamp_); ele_param_ = param_.get_mutable(); auto act_t = static_cast(ele_param_)->act_type; VLOG(4) << "act: " << act_t; diff --git a/lite/kernels/opencl/grid_sampler_image_compute.cc b/lite/kernels/opencl/grid_sampler_image_compute.cc index 243737a81331a7159834d30ccfb2fab181baeebe..4fb13a61181ba282f7005ea158768ee18b94b7a0 100644 --- a/lite/kernels/opencl/grid_sampler_image_compute.cc +++ b/lite/kernels/opencl/grid_sampler_image_compute.cc @@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLiteAs(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/grid_sampler_kernel.cl", + build_options_, + time_stamp_); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + } + + void ReInitWhenNeeded() override { grid_param_ = param_.get_mutable(); + auto x_dims = grid_param_->x->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + out_img_shape_ = + default_convertor.InitImageDimInfoWith(grid_param_->out->dims()); + + // compute global work size + GetGlobalWorkSize(); + } + } - auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_); - VLOG(4) << "kernel_func_name_:" << kernel_func_name_; + void GetGlobalWorkSize() { + auto default_work_size = + DefaultWorkSize(grid_param_->out->dims(), + DDim(std::vector{ + static_cast(out_img_shape_[0]), + static_cast(out_img_shape_[1])})); + global_work_size_ = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2] / 4)}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "default_work_size: " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; + VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " " + << global_work_size_[1] << " " << global_work_size_[2]; +#endif } void Run() override { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - auto* x = grid_param_->x; - auto* out = grid_param_->out; auto* grid = grid_param_->grid; + auto* out = grid_param_->out; + auto out_dims = out->dims(); - auto in_dims = x->dims(); + int out_height = out_dims[2]; + int out_width = out_dims[3]; + + auto* x_img = x->data(); + auto* grid_img = x->data(); + auto* out_img = out->mutable_data(out_img_shape_[0], + out_img_shape_[1]); #ifndef LITE_SHUTDOWN_LOG + auto in_dims = x->dims(); VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; -#endif - - auto out_image_shape = InitImageDimInfoWith(out_dims); - auto* x_img = x->data(); // VLOG(4) << "x_image: " << x_img; - - auto* grid_img = x->data(); // VLOG(4) << "grid_img: " << grid_img; - - auto* out_img = out->mutable_data( - out_image_shape["width"], out_image_shape["height"]); -#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; - VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " - << out_image_shape["height"]; + VLOG(4) << 
"out_img_shape_[w,h]:" << out_img_shape_[0] << " " + << out_img_shape_[1]; #endif - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int arg_idx = 0; - int out_height = out_dims[2]; - int out_width = out_dims[3]; - auto default_work_size = - DefaultWorkSize(out_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "default_work_size: " << default_work_size[0] << ", " - << default_work_size[1] << ", " << default_work_size[2]; -#endif - cl_int status = kernel.setArg(arg_idx++, *x_img); + cl_int status; + auto kernel = kernel_; + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, *grid_img); + status = kernel.setArg(1, *grid_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, *out_img); + status = kernel.setArg(2, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, out_height); + status = kernel.setArg(3, out_height); CL_CHECK_FATAL(status); - status = kernel.setArg(arg_idx++, out_width); + status = kernel.setArg(4, out_width); CL_CHECK_FATAL(status); - auto global_work_size = - cl::NDRange{static_cast(default_work_size[0]), - static_cast(default_work_size[1]), - static_cast(default_work_size[2] / 4)}; - + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " - << global_work_size[1] << " " << global_work_size[2]; -#endif } protected: param_t* grid_param_{nullptr}; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); std::string kernel_func_name_{"grid_sampler"}; + cl::Kernel kernel_; + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/image_helper.h b/lite/kernels/opencl/image_helper.h index d0d282250d1c5658bc8f684b52b4b0d140895833..81d38bc683eb355b1d85a307d35839b4e3e8ef45 100644 --- a/lite/kernels/opencl/image_helper.h +++ b/lite/kernels/opencl/image_helper.h @@ -74,6 +74,12 @@ static std::vector DefaultWorkSize(const DDim& image_dim, LOG(FATAL) << " not support this dim, need imp "; } +static const std::string GetTimeStamp() { + struct timeval time; + gettimeofday(&time, NULL); + return std::to_string(time.tv_usec); +} + } // namespace opencl } // namespace kernels } // namespace lite diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc b/lite/kernels/opencl/instance_norm_image_compute.cc index 6bdec0ca6cdfd16219becf704de4d5701aad3197..c5e02ae0ed4ae9facf36747d99ee825e6eab6515 100644 --- a/lite/kernels/opencl/instance_norm_image_compute.cc +++ b/lite/kernels/opencl/instance_norm_image_compute.cc @@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLiteAs(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/instance_norm_kernel.cl", + 
build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLiteGetKernel(kernel_key.str()); cl_int status = kernel.setArg(0, out_w); @@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite( scale_img_size[0], scale_img_size[1], bias_img.data()); auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/instance_norm_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLiteGetKernel(kernel_key.str()); auto* scale_img = scale_image_.data(); auto* bias_img = bias_image_.data(); @@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite event_{new cl::Event}; Tensor scale_image_; Tensor bias_image_; diff --git a/lite/kernels/opencl/io_copy_buffer_compute.cc b/lite/kernels/opencl/io_copy_buffer_compute.cc index 6a49cc2577a58690e5e0b6a6ede82df0bdc99bb1..f76f667923fa8d39847db5dae8e07d7398f25f99 100644 --- a/lite/kernels/opencl/io_copy_buffer_compute.cc +++ b/lite/kernels/opencl/io_copy_buffer_compute.cc @@ -106,6 +106,7 @@ class IoCopykOpenCLToHostCompute auto& context = ctx_->As(); auto* wait_list = context.cl_wait_list(); + auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { #ifndef LITE_SHUTDOWN_LOG @@ -113,6 +114,9 @@ class IoCopykOpenCLToHostCompute #endif auto& event = *(it->second); event.wait(); + auto command_queue = CLRuntime::Global()->command_queue(); + command_queue.flush(); + command_queue.finish(); } else { LOG(FATAL) << "Could not find the sync event for the target cl tensor."; } diff --git a/lite/kernels/opencl/lrn_image_compute.cc b/lite/kernels/opencl/lrn_image_compute.cc index edce0368ddc9cda54fdab44b472fcd0e771413ae..0e01bdc107c4fcb4a0caf943cfb1b768557dd671 100644 --- a/lite/kernels/opencl/lrn_image_compute.cc +++ b/lite/kernels/opencl/lrn_image_compute.cc @@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLitebeta; norm_region_ = lrn_param_->norm_region; context.cl_context()->AddKernel( - kernel_func_name_, "image/lrn_kernel.cl", build_options_); + kernel_func_name_, "image/lrn_kernel.cl", build_options_, time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLiteGetKernel(kernel_key.str()); int arg_idx = 0; @@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/mul_buffer_compute.cc b/lite/kernels/opencl/mul_buffer_compute.cc index 4c46da67da9877fb37b214b6d738b3dd3da3e5bb..e8edb359898fb47cf47919a25e521ca9f8353104 100644 --- a/lite/kernels/opencl/mul_buffer_compute.cc +++ b/lite/kernels/opencl/mul_buffer_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" @@ -32,8 +33,10 @@ class MulCompute void PrepareForRun() override { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/mat_mul_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/mat_mul_kernel.cl", + build_options_, + time_stamp_); const auto& param = *param_.get_mutable(); const 
auto* x_data = param.x->data(); const auto* y_data = param.y->data(); @@ -68,7 +71,7 @@ class MulCompute param.output->mutable_data(TARGET(kOpenCL)); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -103,6 +106,7 @@ class MulCompute int m_, n_, k_; std::string kernel_func_name_{"mat_mul"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/nearest_interp_image_compute.cc b/lite/kernels/opencl/nearest_interp_image_compute.cc index 082f21ab1ae792ae33e9e2a368073274258b8884..17637e2569556d1eeb8b6002c0073223345ac7ec 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute.cc @@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault void PrepareForRun() override { auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/nearest_interp_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int arg_idx = 0; @@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault private: std::string kernel_func_name_{"nearest_interp"}; std::string build_options_{" -DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pad2d_image_compute.cc b/lite/kernels/opencl/pad2d_image_compute.cc index 1be4729ee1b24ac77383de4d7c111e9d37d29d6b..f16642d449d29c2afd3db7097432945c73d107e3 100644 --- a/lite/kernels/opencl/pad2d_image_compute.cc +++ b/lite/kernels/opencl/pad2d_image_compute.cc @@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLiteAs(); - context.cl_context()->AddKernel( - kernel_func_name_, "image/pad2d_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/pad2d_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; } @@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLiteGetKernel(kernel_key.str()); int arg_idx = 0; @@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pool_buffer_compute.cc b/lite/kernels/opencl/pool_buffer_compute.cc index 3f491afb86d4e4d5144522b6fb028c225c9a97e4..aeba4bcd2ea1d9b1f14ac86509ab9dbec2509ad0 100644 --- a/lite/kernels/opencl/pool_buffer_compute.cc +++ b/lite/kernels/opencl/pool_buffer_compute.cc @@ -37,8 +37,10 @@ class PoolCompute const auto& param = *param_.get_mutable(); kernel_func_name_ += param.pooling_type; auto& context = ctx_->As(); - context.cl_context()->AddKernel( - kernel_func_name_, "buffer/pool_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "buffer/pool_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -69,7 +71,7 @@ class PoolCompute auto* output_buf = param.output->mutable_data(TARGET(kOpenCL)); 
STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; auto numel = out_dims.production(); @@ -117,6 +119,7 @@ class PoolCompute private: std::string kernel_func_name_{"pool_"}; std::string build_options_{"-DCL_DTYPE_float"}; + std::string time_stamp_{GetTimeStamp()}; std::shared_ptr event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index 39da325ebb10c85f153e349173aa833bbf5e1f6e..34524122c8e475df63db02eae32b7d100abfa2d9 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLiteAs(); context.cl_context()->AddKernel( - kernel_func_name_, "image/pool_kernel.cl", build_options_); + kernel_func_name_, "image/pool_kernel.cl", build_options_, time_stamp_); } void Run() override { @@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLiteGetKernel(kernel_key.str()); int c_block = (out_dims[1] + 3) / 4; @@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/reshape_image_compute.cc b/lite/kernels/opencl/reshape_image_compute.cc index 376add226216a57a0868c9c52497b784929a207e..febb1c33d9c4df2cb58580a03bda1eff93ed4da7 100644 --- a/lite/kernels/opencl/reshape_image_compute.cc +++ b/lite/kernels/opencl/reshape_image_compute.cc @@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/reshape_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/reshape_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); #ifndef LITE_SHUTDOWN_LOG @@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/opencl/scale_image_compute.cc b/lite/kernels/opencl/scale_image_compute.cc index 5fd9a2b46b5ce3b0ad84449785f510d5f0391250..97b56e68d47fcdf1647433f5e267c264fb36c5c2 100644 --- a/lite/kernels/opencl/scale_image_compute.cc +++ b/lite/kernels/opencl/scale_image_compute.cc @@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLiteAs(); + context.cl_context()->AddKernel(kernel_func_name_, + "image/scale_kernel.cl", + build_options_, + time_stamp_); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/scale_kernel.cl", build_options_); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + kernel_ = context.cl_context()->GetKernel(kernel_key.str()); + } + + void ReInitWhenNeeded() override { + scale_param_ = param_.get_mutable(); + auto x_dims = scale_param_->x->dims(); + if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || + first_epoch_for_reinit_) { + last_x_dims_ = x_dims; + first_epoch_for_reinit_ = false; + + // compute image shape + paddle::lite::CLImageConverterDefault default_convertor; + 
out_img_shape_ = + default_convertor.InitImageDimInfoWith(scale_param_->output->dims()); + + // compute global work size + GetGlobalWorkSize(); + } + } + + void GetGlobalWorkSize() { + global_work_size_ = + cl::NDRange{static_cast(out_img_shape_[0]), + static_cast(out_img_shape_[1])}; } void Run() override { - const auto& param = *param_.get_mutable(); - const auto& in_dims = param.x->dims(); - auto* x_img = param.x->data(); - const float scale = param.scale; - const float bias = param.bias; - - // LOG(INFO) << "x_image" << x_img; - auto out_image_shape = InitImageDimInfoWith(in_dims); -#ifndef LITE_SHUTDOWN_LOG - VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " " - << out_image_shape["height"]; -#endif - auto* out_img = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - // LOG(INFO) << "out_image" << out_img; + auto* x_img = scale_param_->x->data(); + auto* out_img = scale_param_->output->mutable_data( + out_img_shape_[0], out_img_shape_[1]); + const float scale = scale_param_->scale; + const float bias = scale_param_->bias; auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; - auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - - auto global_work_size = - cl::NDRange{static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])}; + auto kernel = kernel_; cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, *x_img); + status = kernel.setArg(0, *x_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_img); + status = kernel.setArg(1, *out_img); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, scale); + status = kernel.setArg(2, scale); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, bias); + status = kernel.setArg(3, bias); CL_CHECK_FATAL(status); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, - global_work_size, + global_work_size_, cl::NullRange, nullptr, event_.get()); @@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite event_{new cl::Event}; + + param_t* scale_param_{nullptr}; + cl::Kernel kernel_; + bool first_epoch_for_reinit_{true}; + DDim last_x_dims_; + DDim out_img_shape_ = DDim(std::vector( + {static_cast(1), static_cast(1)})); + cl::NDRange global_work_size_ = cl::NDRange{ + static_cast(1), static_cast(1), static_cast(1)}; }; } // namespace opencl diff --git a/lite/kernels/opencl/slice_image_compute.cc b/lite/kernels/opencl/slice_image_compute.cc index 149ef35afe3d49ca8793769ee7ad366292462296..dd231ec8647ba88ab0f953661af47bc36c948e8b 100644 --- a/lite/kernels/opencl/slice_image_compute.cc +++ b/lite/kernels/opencl/slice_image_compute.cc @@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLiteAs(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel( - kernel_func_name_, "image/slice_kernel.cl", build_options_); + context.cl_context()->AddKernel(kernel_func_name_, + "image/slice_kernel.cl", + build_options_, + time_stamp_); } void Run() override { @@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLiteAs(); CHECK(context.cl_context() != nullptr); STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); cl_int status; @@ -108,6 +110,7 @@ class SliceComputeImage2D : public 
KernelLite event_{new cl::Event}; }; diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index 3d79dc3dfee80613c39f51323e7ba61adcf7cd8a..2036a343d722d5c01a4b9dcd0d4cdf682a92d218 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -2,7 +2,7 @@ if(NOT LITE_WITH_X86) return() endif() -add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) +add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_function) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) @@ -30,6 +30,8 @@ add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps} blas math_function sequence2batch gru_compute) #add_kernel(gru_compute_x86 X86 basic SRCS gru_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_unpad_compute_x86 X86 basic SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding) +add_kernel(sequence_conv_compute_x86 X86 basic SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_function blas context_project) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h index 5d8110e67c17f3a0f8d3211179df831dad83cc9b..65d270e02fab902a1dfa92ddf27de040ef43a1b9 100644 --- a/lite/kernels/x86/activation_compute.h +++ b/lite/kernels/x86/activation_compute.h @@ -21,7 +21,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/fluid/eigen.h" -#include "lite/operators/activation_ops.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -231,8 +231,8 @@ class SoftsignCompute : public KernelLite { // auto& context = ctx_->As(); auto& param = *param_.get_mutable(); - const T* x_data = param.X->data(); - T* out_data = param.Out->mutable_data(); + const T* x_data = param.X->template data(); + T* out_data = param.Out->template mutable_data(); size_t x_size = param.X->numel(); for (size_t i = 0; i < x_size; i++) { out_data[i] = x_data[i] / (static_cast(1) + std::abs(x_data[i])); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h index b9124e5ad49a0d68c41a21fe55d28102f09d14b9..f6d3d5aa31df1f188c196ac283c734c879f40244 100644 --- a/lite/kernels/x86/attention_padding_mask_compute.h +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -45,9 +45,9 @@ class AttentionPaddingMaskCompute auto src_len = static_cast(bottom1->lod()[0][1]); const int att_batch = bottom0->lod()[0].size() - 1; const int src_batch = bottom1->lod()[0].size() - 1; - int* pad_begin = _pad_begin->mutable_data(); + int* pad_begin = _pad_begin->template mutable_data(); for (int i = 0; i < src_batch; ++i) { - const auto* src_data = bottom1->data() + src_len * i; + const auto* src_data = bottom1->template data() + src_len * i; int index = src_len - 1; for (; index >= 0 && _pad_id == 
static_cast(src_data[index]); --index) { @@ -56,13 +56,14 @@ class AttentionPaddingMaskCompute } const auto att_len = static_cast(bottom0->lod()[0][1]); - auto* top_data = top->mutable_data(); + auto* top_data = top->template mutable_data(); memcpy(top_data, - bottom0->data(), + bottom0->template data(), bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); for (int i = 0; i < att_batch; ++i) { for (int j = 0; j < att_len; ++j) { - top_data = top->mutable_data() + src_len * (att_len * i + j); + top_data = + top->template mutable_data() + src_len * (att_len * i + j); int src_idx = i % src_batch; for (int k = pad_begin[src_idx]; k < src_len; ++k) { top_data[k] = _mask; diff --git a/lite/kernels/x86/batch_norm_compute.h b/lite/kernels/x86/batch_norm_compute.h index 092280752cb92e1784eefc09cb26fa3bea8eb939..0f206b8c32aaaf9b3a1b278a69f3a9aa77a11ba6 100644 --- a/lite/kernels/x86/batch_norm_compute.h +++ b/lite/kernels/x86/batch_norm_compute.h @@ -59,26 +59,26 @@ class BatchNormCompute : public KernelLite { const int sample_size = x->dims().production() / N / C; // alloc memory - param.y->mutable_data(); + param.y->template mutable_data(); if (!param.is_test) { - param.mean_out->mutable_data(); - param.variance_out->mutable_data(); - param.saved_mean->mutable_data(); - param.saved_variance->mutable_data(); + param.mean_out->template mutable_data(); + param.variance_out->template mutable_data(); + param.saved_mean->template mutable_data(); + param.saved_variance->template mutable_data(); } if (!global_stats) { // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e(param.saved_mean->mutable_data(), - C); + EigenVectorArrayMap saved_mean_e( + param.saved_mean->template mutable_data(), C); EigenVectorArrayMap saved_variance_e( - param.saved_variance->mutable_data(), C); + param.saved_variance->template mutable_data(), C); saved_mean_e.setZero(); saved_variance_e.setZero(); - EigenVectorArrayMap running_mean_arr(param.mean_out->mutable_data(), - C); + EigenVectorArrayMap running_mean_arr( + param.mean_out->template mutable_data(), C); EigenVectorArrayMap running_var_arr( - param.variance_out->mutable_data(), C); + param.variance_out->template mutable_data(), C); if ((N * sample_size) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " @@ -89,7 +89,8 @@ class BatchNormCompute : public KernelLite { switch (param.data_layout) { case DATALAYOUT(kNCHW): { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap x_arr( + x->template data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); } @@ -115,33 +116,37 @@ class BatchNormCompute : public KernelLite { // use SavedMean and SavedVariance to do normalize Eigen::Array inv_std(C); if (global_stats) { - ConstEigenVectorArrayMap var_arr(param.variance->data(), C); + ConstEigenVectorArrayMap var_arr(param.variance->template data(), + C); inv_std = (var_arr + param.epsilon).sqrt().inverse(); } else { EigenVectorArrayMap saved_inv_std( - param.saved_variance->mutable_data(), C); + param.saved_variance->template mutable_data(), C); // inverse SavedVariance first, gradient will use it too. saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); inv_std = saved_inv_std; } ConstEigenVectorArrayMap mean_arr( - global_stats ? param.mean->data() : param.saved_mean->data(), C); + global_stats ? 
param.mean->template data() + : param.saved_mean->template data(), + C); // ((x - est_mean) * (inv_var) * scale + bias // formula transform ====> // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - ConstEigenVectorArrayMap scale_arr(param.scale->data(), C); - ConstEigenVectorArrayMap bias_arr(param.bias->data(), C); + ConstEigenVectorArrayMap scale_arr(param.scale->template data(), C); + ConstEigenVectorArrayMap bias_arr(param.bias->template data(), C); Eigen::Array new_scale = inv_std * scale_arr; Eigen::Array new_bias = bias_arr - mean_arr * inv_std * scale_arr; switch (param.data_layout) { case DATALAYOUT(kNCHW): { - EigenArrayMap y_arr(param.y->mutable_data(), sample_size, N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + EigenArrayMap y_arr( + param.y->template mutable_data(), sample_size, N * C); + ConstEigenArrayMap x_arr(x->template data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); } diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc index d342056c7f19e9eba0fe16196d772da6bd5fda3c..bbb63e595269667dedebeafd83cc962d1d0fb878 100644 --- a/lite/kernels/x86/cast_compute.cc +++ b/lite/kernels/x86/cast_compute.cc @@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL( + cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>, + fp16_to_any) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/concat_compute.h b/lite/kernels/x86/concat_compute.h index 935f0811d4e7a7cbe2ce5fafa61b6d16a25d4a81..e423cd04f16917f200f45ac93d9a6a09f3fb1c54 100644 --- a/lite/kernels/x86/concat_compute.h +++ b/lite/kernels/x86/concat_compute.h @@ -47,7 +47,7 @@ class ConcatCompute : public KernelLite { int64_t axis = static_cast(param.axis); auto* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + auto* axis_tensor_data = axis_tensor->template data(); axis = static_cast(axis_tensor_data[0]); } @@ -60,7 +60,7 @@ class ConcatCompute : public KernelLite { int concat_input_size = count(axis + 1, x_dims.size(), x_dims); const int top_concat_axis = out->dims()[axis]; for (size_t i = 0; i < param.x.size(); ++i) { - const T* bottom_data = param.x[i]->data(); + const T* bottom_data = param.x[i]->template data(); const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; for (int n = 0; n < num_concat; ++n) { std::memcpy( diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index e9f403059f90cf6635bc22db3e6890b86cbe85f6..29442158c756418327dd3de31fd4dfdbec2cbc1d 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -52,7 +52,7 @@ class Conv2dCompute : public KernelLite { auto& context = ctx_->As(); auto& param = *param_.get_mutable(); lite::Tensor filter = *param.filter; - param.output->mutable_data(); + param.output->template mutable_data(); const int batch_size = static_cast(param.x->dims()[0]); std::vector filter_shape_vec(filter.dims().Vectorize()); @@ -95,9 +95,9 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch = param.x->Slice(i, i 
+ 1); + lite::Tensor in_batch = param.x->template Slice(i, i + 1); in_batch.Resize(input_shape); - lite::Tensor out_batch = param.output->Slice(i, i + 1); + lite::Tensor out_batch = param.output->template Slice(i, i + 1); out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { lite::Tensor in_slice = diff --git a/lite/kernels/x86/dropout_compute.h b/lite/kernels/x86/dropout_compute.h index 2ba383bdbdc99e7643f3bf09350f833665c8548e..4b5f3359501b8b4c801c395dfa7d5990d9d4d7a3 100644 --- a/lite/kernels/x86/dropout_compute.h +++ b/lite/kernels/x86/dropout_compute.h @@ -38,10 +38,10 @@ class DropoutCompute : public KernelLite { using param_t = operators::DropoutParam; void Run() override { auto& param = *param_.get_mutable(); - const auto* x_data = param.x->data(); - auto* out_data = param.output->mutable_data(); + const auto* x_data = param.x->template data(); + auto* out_data = param.output->template mutable_data(); if (!param.is_test) { - auto* mask_data = param.mask->mutable_data(); + auto* mask_data = param.mask->template mutable_data(); std::random_device rnd; std::minstd_rand engine; int seed = param.fix_seed ? param.seed : rnd(); diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index 40116479f6f4d6dc8658c2d781a48b7a07dd20c9..42ea38d979e39f97a8aef971370c83303c53c48f 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -248,8 +248,8 @@ class TransformFunctor { lite::Tensor *z, const lite::Context &ctx, Functor func) - : x_(x->data()), - y_(y->data()), + : x_(x->template data()), + y_(y->template data()), z_(z->mutable_data()), nx_(x->numel()), ctx_(ctx), @@ -483,9 +483,10 @@ void FusedElemwiseAndActComputeNoBroadcast(const lite::Context &ctx, x.data(), y.data(), compound_functor, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()}); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()}); } template &ctx, compound_functor, h, w, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); } else { FusedElemwiseAndActBroadcast2CPU &ctx, n, post, compound_functor, - out->mutable_data(), - intermediate_out == nullptr ? nullptr - : intermediate_out->mutable_data()); + out->template mutable_data(), + intermediate_out == nullptr + ? nullptr + : intermediate_out->template mutable_data()); } } diff --git a/lite/kernels/x86/fc_compute.h b/lite/kernels/x86/fc_compute.h index e719b8d2216949746f612bca0689c22be0606031..9f25a2584fe8d2579939e144d6799ba79927ae63 100644 --- a/lite/kernels/x86/fc_compute.h +++ b/lite/kernels/x86/fc_compute.h @@ -140,9 +140,9 @@ class FcCompute : public KernelLite { int M = output->dims().production() / w_dims1; - const T* input_data = input->data(); - const T* w_data = w->data(); - T* output_data = output->mutable_data(); + const T* input_data = input->template data(); + const T* w_data = w->template data(); + T* output_data = output->template mutable_data(); auto& context = ctx_->As(); FCFunctor fc; @@ -153,7 +153,7 @@ class FcCompute : public KernelLite { input_data, w_data, output_data, - bias ? bias->data() : NULL, + bias ? 
bias->template data() : NULL, with_relu, padding_weights); } diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute.h b/lite/kernels/x86/fill_constant_batch_size_like_compute.h index 8d49b0816d85f30351a4ded81e0f6ef650b6c445..1c54912c21d1479b990c5a56064d9789e8619400 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute.h +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute.h @@ -42,9 +42,9 @@ class FillConstantBatchSizeLikeCompute int output_dim_idx = param.output_dim_idx; odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; out->Resize(odims); - // out->mutable_data(); + // out->template mutable_data(); } - out->mutable_data(); + out->template mutable_data(); auto value = param.value; paddle::lite::x86::math::SetConstant setter; diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h index 6ee270647f8fb7d7ec540047cd4d546a7eb89ce8..bd01d9da3af1640770838c262dcd848b557d40c3 100644 --- a/lite/kernels/x86/gather_compute.h +++ b/lite/kernels/x86/gather_compute.h @@ -50,9 +50,9 @@ void CPUGather(const lite::Tensor* src, auto src_dims = src->dims(); - const T* p_src = src->data(); + const T* p_src = src->template data(); const IndexT* p_index = index->data(); - T* p_output = output->mutable_data(); + T* p_output = output->template mutable_data(); // slice size int slice_size = 1; @@ -77,7 +77,7 @@ class GatherCompute : public KernelLite { auto index = param.Index; auto out = param.Out; - out->mutable_data(); + out->template mutable_data(); if (x->dims().production() == 0) return; /* * Since there's no type defined for lite::Tensor in Paddle-Lite, then diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index 89076b51dae1fed4b8f56b280f177caf1f142158..e701ba16a55e9695c6b70f07cc4e1443e6b75698 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -44,7 +44,7 @@ inline void ReorderInitState(const lite::Context& context, bool indexed_src) { lite::x86::math::CopyMatrixRowsFunctor row_shuffle; dst->Resize(src.dims()); - dst->mutable_data(); + dst->template mutable_data(); row_shuffle(context, src, index_lod, dst, indexed_src); } @@ -65,18 +65,19 @@ class GRUCompute : public KernelLite { auto* input = param.input; auto* h0 = param.h0; auto* weight = param.weight; - const T* weight_data = weight->data(); + const T* weight_data = weight->template data(); auto* bias = param.bias; auto* batch_gate = param.batch_gate; auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; auto* batch_hidden = param.batch_hidden; - T* batch_gate_ptr = batch_gate->mutable_data(); - T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data(); - T* batch_hidden_ptr = batch_hidden->mutable_data(); + T* batch_gate_ptr = batch_gate->template mutable_data(); + T* batch_reset_hidden_prev_ptr = + batch_reset_hidden_prev->template mutable_data(); + T* batch_hidden_ptr = batch_hidden->template mutable_data(); auto* hidden = param.hidden; - hidden->mutable_data(); + hidden->template mutable_data(); const auto& hidden_dims = hidden->dims(); @@ -99,7 +100,7 @@ class GRUCompute : public KernelLite { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. 
- const std::vector& order(batch_gate->lod()[2]); + const std::vector& order(batch_gate->lod()[2]); ReorderInitState(context, *h0, order, &ordered_h0, true); gru_value.prev_out_value = ordered_h0.mutable_data(); } else { diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index ca2ddf60c5e150ba7d2712ccb2e67e444cd07010..46d151bbc406e19b498b87420029da7f9c1c2f12 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -47,9 +47,9 @@ class LayerNormCompute : public KernelLite { auto x_dims = x->dims(); - y->mutable_data(); - Mean->mutable_data(); - Var->mutable_data(); + y->template mutable_data(); + Mean->template mutable_data(); + Var->template mutable_data(); auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); int left = static_cast(matrix_dim[0]); @@ -73,10 +73,10 @@ class LayerNormCompute : public KernelLite { .At(right); ker(in.mutable_data(), out.mutable_data(), - Mean->mutable_data(), - Var->mutable_data(), - Scale->data(), - Bias->data(), + Mean->template mutable_data(), + Var->template mutable_data(), + Scale->template data(), + Bias->template data(), static_cast(left), epsilon, right); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index 1801144f6eeb25a40fa052440b63913bc41a65a3..73cffe4ce8130b18612e42b0243205e74e011005 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -33,15 +33,15 @@ class LookupTableCompute : public KernelLite { auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; - const int64_t *ids = ids_t->data(); + const int64_t *ids = ids_t->template data(); int64_t ids_numel = ids_t->dims().production(); auto *table_t = param.W; int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - const T *table = table_t->data(); - T *output = output_t->mutable_data(); + const T *table = table_t->template data(); + T *output = output_t->template mutable_data(); memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc index feda180d22e59b2ca0e8f0f89f3c7a1ddb8acd4a..171308b1a8b0294241e77366390c4828172bc077 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -35,7 +35,7 @@ void MatchMatrixTensorCompute::Run() { const auto& offset_l = x->lod()[0]; const auto& offset_r = y->lod()[0]; - std::vector top_offset; + std::vector top_offset; int top_size = 0; top_offset.push_back(top_size); for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { @@ -97,9 +97,9 @@ void MatchMatrixTensorCompute::Run() { int batch_size = x->lod()[0].size() - 1; int lod_lv1_size = batch_size * dim_t; int lod_lv2_size = x->lod()[0].back() * dim_t; - std::vector out_lod0(batch_size + 1, 0); - std::vector out_lod1(lod_lv1_size + 1, 0); - std::vector out_lod2(lod_lv2_size + 1, 0); + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); for (int i = 0; i < batch_size; i++) { out_lod0[i + 1] = out_lod0[i] + dim_t; int len_l = offset_l[i + 1] - offset_l[i]; diff --git a/lite/kernels/x86/matmul_compute.h b/lite/kernels/x86/matmul_compute.h index 3d2b3c7482c266d0c8771c9be1dbac540a315528..e17f12b6b6471bfb587fc3866695b808e11122da 100644 --- 
a/lite/kernels/x86/matmul_compute.h +++ b/lite/kernels/x86/matmul_compute.h @@ -56,7 +56,7 @@ class MatMulCompute : public KernelLite { auto *x = param.X; auto *y = param.Y; auto *out = param.Out; - out->mutable_data(); + out->template mutable_data(); auto blas = lite::x86::math::GetBlas(context); auto mat_dim_a = lite::x86::math::CreateMatrixDescriptor( diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index be58f24ba2ed37db6661ecaaceb0d9d70fdd75d4..5c3dbe9342c8642470e8997fc2fec6428c2aa832 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -64,7 +64,7 @@ class MulCompute : public KernelLite { y_matrix = *y; } - z->mutable_data(); + z->template mutable_data(); auto z_dim = z->dims(); if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); diff --git a/lite/kernels/x86/reduce_compute.h b/lite/kernels/x86/reduce_compute.h index f93157c837995792772c86d969312bfa28341ce4..1b7c99eeef9dd80525eb9ed249bdf6ed1e493443 100644 --- a/lite/kernels/x86/reduce_compute.h +++ b/lite/kernels/x86/reduce_compute.h @@ -49,7 +49,7 @@ class ReduceSumCompute : public KernelLite { bool reduce_all = param.reduce_all; auto* input = param.x; auto* output = param.output; - param.output->mutable_data(); + param.output->template mutable_data(); const auto& dims = param.dim; bool keep_dim = param.keep_dim; diff --git a/lite/kernels/x86/scale_compute.h b/lite/kernels/x86/scale_compute.h index c78f385b96dd2bdbf83204f2a80739657350ae7e..978a81fb22f382f9f036e503e3f674d38f1467a6 100644 --- a/lite/kernels/x86/scale_compute.h +++ b/lite/kernels/x86/scale_compute.h @@ -41,8 +41,8 @@ class ScaleCompute : public KernelLite { void Run() override { auto& param = *param_.get_mutable(); - scale_compute(param.x->data(), - param.output->mutable_data(), + scale_compute(param.x->template data(), + param.output->template mutable_data(), param.x->dims().production(), param.scale, param.bias, diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc index 95839ba71b9f63fad9d659fd65c0028005d29799..f25c960f19b60056bd9702a31774a378378f24d6 100644 --- a/lite/kernels/x86/search_grnn_compute.cc +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -84,7 +84,7 @@ void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { int max_width = width_data[idx_sorted_by_width_data[0]]; // start of reorganizing the input - std::vector new_offset; + std::vector new_offset; new_offset.resize(max_width + 1); new_offset[0] = 0; diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h index 17244d15d9124d9d61d1f4fdef4f12590958c0be..eee2a8ac8ef757d776580eac9dfc2c6e31694107 100644 --- a/lite/kernels/x86/search_group_padding_compute.h +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -50,7 +50,7 @@ class SearchGroupPaddingCompute } } - std::vector new_offset; + std::vector new_offset; new_offset.resize(batch + 1); for (int i = 0; i < batch + 1; ++i) { new_offset[i] = i * max_seq; @@ -67,7 +67,7 @@ class SearchGroupPaddingCompute top1_lod.push_back(offset); top1->set_lod(top1_lod); top1->Resize({dim0, 1}); - memset(top1->mutable_data(), + memset(top1->template mutable_data(), 0, top1->dims()[0] * top1->dims()[1] * sizeof(T)); // for padding input id @@ -76,9 +76,9 @@ class SearchGroupPaddingCompute top2->set_lod(top2_lod); top2->Resize({batch * max_seq, 1}); // copy data - const auto* bottom_data = bottom0->data(); - auto* top_data = top0->mutable_data(); - auto* top_padding_input_data = 
top2->mutable_data(); + const auto* bottom_data = bottom0->template data(); + auto* top_data = top0->template mutable_data(); + auto* top_padding_input_data = top2->template mutable_data(); for (int i = 0; i < batch; i++) { const int copy_step = offset[i + 1] - offset[i]; const int start = i * max_seq; diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h index 80ef54b30b762848eceb16940c9f60ef8ba96927..0f19466e0862e36e744fe74d985ab6136dee0e8d 100644 --- a/lite/kernels/x86/search_seq_fc_compute.h +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -58,8 +58,10 @@ class SearchSeqFcCompute : public KernelLite { int M = x_dims[0]; int N = w_dims[0]; for (int i = 0; i < M; i++) { - blas.AXPY( - N, static_cast(1), b->data(), out->mutable_data() + i * N); + blas.AXPY(N, + static_cast(1), + b->template data(), + out->template mutable_data() + i * N); } } } diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h index 88510b8b1c7a04ab01da9af331f9d1f72765b215..080d0bcd0b42f6f59266e56d0f729eb2a28d4179 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute.h +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -39,9 +39,9 @@ class SequenceArithmeticCompute out->Resize(x->dims()); out->set_lod(x->lod()); - auto x_data = x->data(); - auto y_data = y->data(); - auto out_data = out->mutable_data(); + auto x_data = x->template data(); + auto y_data = y->template data(); + auto out_data = out->template mutable_data(); auto x_seq_offset = x->lod()[0]; auto y_seq_offset = y->lod()[0]; int seq_num = x_seq_offset.size() - 1; diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h index 8dd7077f7dbbb3e61f21d63e8c935157b3d2d579..cbf8a41b7e2228d3b2fab3fe5049281850961c1e 100644 --- a/lite/kernels/x86/sequence_concat_compute.h +++ b/lite/kernels/x86/sequence_concat_compute.h @@ -25,7 +25,7 @@ namespace x86 { template inline LoD ConcatLoD(const std::vector& xs, std::vector* xs_in_order) { - std::vector result; + std::vector result; result.resize(xs[0]->lod()[0].size()); for (size_t i = 1; i < result.size(); ++i) { @@ -75,7 +75,7 @@ class SequenceConcatCompute out_dims[0] = batch_size; param.Out->Resize(out_dims); - T* dout = param.Out->mutable_data(); + T* dout = param.Out->template mutable_data(); std::vector x_in_order; param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index be1f86a5c848b5c03634ea2a1aed0d57f2283879..eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -26,7 +26,7 @@ namespace x86 { namespace { inline LoD ConcatLoD(const std::vector& xs, std::vector* xs_in_order) { - std::vector result; + std::vector result; result.resize(xs[0]->lod()[0].size()); for (size_t i = 1; i < result.size(); ++i) { diff --git a/lite/kernels/x86/sequence_conv_compute.cc b/lite/kernels/x86/sequence_conv_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..32bf8b315c7952a74846af5c4e5548767c80e63e --- /dev/null +++ b/lite/kernels/x86/sequence_conv_compute.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_conv_compute.h" + +REGISTER_LITE_KERNEL(sequence_conv, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceConvCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_conv_compute.h b/lite/kernels/x86/sequence_conv_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c1a47aa20f4886aa5dddbe6b398e5365abdc16f2 --- /dev/null +++ b/lite/kernels/x86/sequence_conv_compute.h @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/backends/x86/math/context_project.h" +#include "lite/backends/x86/math/math_function.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace math = paddle::lite::x86::math; + +template +class SequenceConvCompute : public KernelLite { + public: + using param_t = operators::SequenceConvParam; + + void Run() override { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* filter = param.Filter; + auto* out = param.Out; + out->template mutable_data(); + CHECK(in->lod().size() == 1) << "Only support one level sequence now"; + + int context_start = param.contextStart; + int context_stride = param.contextStride; + int context_length = param.contextLength; + bool padding_trainable = false; + const Tensor* padding_data = nullptr; + + int up_pad = std::max(0, -context_start); + int down_pad = std::max(0, context_start + context_length - 1); + auto sequence_width = static_cast(in->dims()[1]); + + std::vector col_shape{in->dims()[0], + context_length * sequence_width}; + Tensor col; + col.Resize(col_shape); + col.mutable_data(); + + // Because if padding_trainable is false, padding data should be zeros. 
+ math::SetConstant set_zero; + auto blas = math::GetBlas(ctx); + set_zero(ctx, &col, static_cast(0)); + math::ContextProjectFunctor seq_project_functor; + + seq_project_functor(ctx, + *in, + padding_data, + padding_trainable, + context_start, + context_length, + context_stride, + up_pad, + down_pad, + &col); + + blas.MatMul(col, *filter, out); + } + + virtual ~SequenceConvCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_expand_as_compute.h b/lite/kernels/x86/sequence_expand_as_compute.h index 16759c1b9f1d136d5aaf58d4531882ab6a2618a2..badbfac14cbeb120d23ea1174a9fc3a218b2224f 100644 --- a/lite/kernels/x86/sequence_expand_as_compute.h +++ b/lite/kernels/x86/sequence_expand_as_compute.h @@ -29,9 +29,10 @@ using Tensor = lite::Tensor; template struct SequenceExpandFunctor { - void operator()(const Tensor &x, - const std::vector &ref_lod, /*expand referenced lod*/ - Tensor *out) { + void operator()( + const Tensor &x, + const std::vector &ref_lod, /*expand referenced lod*/ + Tensor *out) { int64_t hight = x.dims()[0]; int64_t width = x.data_size() / hight; @@ -39,13 +40,13 @@ struct SequenceExpandFunctor { T *out_data = out->mutable_data(); for (int h_id = 0; h_id < hight; ++h_id) { - size_t span = ref_lod[h_id + 1] - ref_lod[h_id]; + uint64_t span = ref_lod[h_id + 1] - ref_lod[h_id]; if (span == 0) continue; const T *src = in_data + h_id * width; - for (int64_t w_id = 0; w_id < width; ++w_id) { + for (uint64_t w_id = 0; w_id < width; ++w_id) { T ele = src[w_id]; size_t offset = ref_lod[h_id] * width; - for (size_t k = 0; k < span; ++k) { + for (uint64_t k = 0; k < span; ++k) { out_data[offset + k * width + w_id] = ele; } } @@ -68,7 +69,7 @@ class SequenceExpandAsCompute CHECK_EQ(y_lod.size(), 1); CHECK_GT(y_lod[0].size(), 1); - out->mutable_data(); + out->template mutable_data(); SequenceExpandFunctor seq_espand_functor; seq_espand_functor(*x, y_lod[0], out); diff --git a/lite/kernels/x86/sequence_pool_compute.h b/lite/kernels/x86/sequence_pool_compute.h index 329a76658d342078ed5d708125d9ff01e0ecef02..20e0307cef2347ce68237f70c990362bbaa210e7 100644 --- a/lite/kernels/x86/sequence_pool_compute.h +++ b/lite/kernels/x86/sequence_pool_compute.h @@ -40,7 +40,7 @@ class SequencePoolCompute : public KernelLite { dims[0] = lod[0].size() - 1; out->Resize({dims}); - out->mutable_data(); + out->template mutable_data(); lite::Tensor* index = nullptr; const bool is_test = true; diff --git a/lite/kernels/x86/sequence_reshape_compute.h b/lite/kernels/x86/sequence_reshape_compute.h index 99f84ebd06e1f5742bbaee9f98ec17aee44bd871..d166f8bc3d80d9f87efb0315462daee3296f393f 100644 --- a/lite/kernels/x86/sequence_reshape_compute.h +++ b/lite/kernels/x86/sequence_reshape_compute.h @@ -64,9 +64,9 @@ class SequenceReshapeCompute out->Resize(std::vector{static_cast(out->lod()[0].back()), out_width}); - auto* dst_ptr = out->mutable_data(); + auto* dst_ptr = out->template mutable_data(); auto size = in->numel() * sizeof(T); - std::memcpy(dst_ptr, in->data(), size); + std::memcpy(dst_ptr, in->template data(), size); } virtual ~SequenceReshapeCompute() = default; diff --git a/lite/kernels/x86/sequence_unpad_compute.cc b/lite/kernels/x86/sequence_unpad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..430f3c47c60b8f5a506ff1191a118db754f1dffe --- /dev/null +++ b/lite/kernels/x86/sequence_unpad_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_unpad_compute.h" + +REGISTER_LITE_KERNEL(sequence_unpad, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceUnpadCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_unpad_compute.h b/lite/kernels/x86/sequence_unpad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5b4e3f6c1638975ec042598942363f516ddf3bb9 --- /dev/null +++ b/lite/kernels/x86/sequence_unpad_compute.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "lite/backends/x86/math/sequence_padding.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace math = paddle::lite::x86::math; + +template +class SequenceUnpadCompute + : public KernelLite { + public: + using param_t = operators::SequenceUnpadParam; + + void Run() override { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + param.Out->template mutable_data(); + int64_t padded_length = param.X->dims()[1]; + math::UnpaddingLoDTensorFunctor()( + ctx, + *param.X, + param.Out, + padded_length, + 0, + false, + math::kBatchLengthWidth); + } + + virtual ~SequenceUnpadCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/shape_compute.h b/lite/kernels/x86/shape_compute.h index ee3678a7f1c6651226c479aeedcacce91085b295..e78684e629727fc7023e6ae4c3385f9c58d48a6b 100644 --- a/lite/kernels/x86/shape_compute.h +++ b/lite/kernels/x86/shape_compute.h @@ -29,7 +29,7 @@ class ShapeCompute : public KernelLite { void Run() override { auto& param = *param_.get_mutable(); // auto& context = context_->As(); - auto out_data = param.Out->mutable_data(); + auto out_data = param.Out->template mutable_data(); auto in_dims = param.X->dims(); for (int i = 0; i < in_dims.size(); ++i) { out_data[i] = in_dims[i]; diff --git a/lite/kernels/x86/softmax_compute.h b/lite/kernels/x86/softmax_compute.h index 5a18a8022773682c0853a3592a9925f3a6015e83..3abc15145bde35a2c442daa9feff7137bcb40fb4 100644 --- a/lite/kernels/x86/softmax_compute.h +++ b/lite/kernels/x86/softmax_compute.h @@ -58,7 +58,7 @@ class SoftmaxCompute : public KernelLite { auto* x = param.x; auto* output = param.output; - output->mutable_data(); + output->template mutable_data(); const int rank = x->dims().size(); const int axis = CanonicalAxis(param.axis, rank); diff --git a/lite/kernels/x86/squeeze_compute.h b/lite/kernels/x86/squeeze_compute.h index 67086f8c732d412064c6bb0bece7e8208f8a0799..3288421c14447a348efd63c8cc5ea4de9bd2b24e 100644 --- a/lite/kernels/x86/squeeze_compute.h +++ b/lite/kernels/x86/squeeze_compute.h @@ -35,8 +35,8 @@ class SqueezeCompute : public KernelLite { auto x = param.X; auto output = param.Out; auto x_dims = x->dims(); - auto* x_data = x->data(); - auto* out_data = output->mutable_data(); + auto* x_data = x->template data(); + auto* out_data = output->template mutable_data(); memcpy(out_data, x_data, x_dims.production() * sizeof(T)); } @@ -54,9 +54,9 @@ class Squeeze2Compute : public KernelLite { auto output = param.Out; auto xshape = param.XShape; auto x_dims = x->dims(); - auto* x_data = x->data(); - auto* out_data = output->mutable_data(); - auto* xshape_data = xshape->mutable_data(); + auto* x_data = x->template data(); + auto* out_data = output->template mutable_data(); + auto* xshape_data = xshape->template mutable_data(); memcpy(out_data, x_data, x_dims.production() * sizeof(T)); memcpy(xshape_data, x_data, x_dims.production() * sizeof(T)); } diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h index 12a6c3490eff9d446de96366c8dd5fe6b2a4bd06..08b3515948750a5cb36627f0349c852e597619e6 100644 --- a/lite/kernels/x86/stack_compute.h +++ b/lite/kernels/x86/stack_compute.h @@ -40,9 +40,9 @@ class StackCompute : public KernelLite { if (axis < 0) axis += (x[0]->dims().size() + 1); int n = static_cast(x.size()); - auto y_data = y->mutable_data(); + auto y_data 
= y->template mutable_data(); std::vector x_datas(n); - for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data(); + for (int i = 0; i < n; ++i) x_datas[i] = x[i]->template data(); int pre = 1, post = 1; auto dim = x[0]->dims(); diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h index 603b96015e267aa24d20bf20f2c3090a2daab74c..5f6faed2017b6bdef60e7505bf1f0088d86b3ec1 100644 --- a/lite/kernels/x86/transpose_compute.h +++ b/lite/kernels/x86/transpose_compute.h @@ -73,7 +73,7 @@ class TransposeCompute : public KernelLite { auto& param = *param_.get_mutable(); auto* x = param.x; auto* out = param.output; - out->mutable_data(); + out->template mutable_data(); int ndims = param.axis.size(); auto& context = ctx_->As(); TransCompute( @@ -92,7 +92,7 @@ class Transpose2Compute : public KernelLite { auto& param = *param_.get_mutable(); auto* x = param.x; auto* out = param.output; - out->mutable_data(); + out->template mutable_data(); int ndims = param.axis.size(); auto& context = ctx_->As(); TransCompute( diff --git a/lite/kernels/x86/uniform_random_compute.cc b/lite/kernels/x86/uniform_random_compute.cc index 64a701d4c67a9bf908f7fc87e9923f22dde811e3..45c1c08d46e5a23857547aac15b952a1123e741f 100644 --- a/lite/kernels/x86/uniform_random_compute.cc +++ b/lite/kernels/x86/uniform_random_compute.cc @@ -34,8 +34,8 @@ class UniformRandomCompute auto *param_out = ¶m.Out->raw_tensor(); - T *data = - param_out->mutable_data(context.x86_device_context()->GetPlace()); + T *data = param_out->template mutable_data( + context.x86_device_context()->GetPlace()); unsigned int seed = static_cast(param.seed); std::minstd_rand engine; diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h index 7a9ba16d2ea87adb40df23e1fbe149ab805afbc8..1bed39f479c87636ff217c8fd7234ea2c5bd5904 100644 --- a/lite/kernels/x86/var_conv_2d_compute.h +++ b/lite/kernels/x86/var_conv_2d_compute.h @@ -80,7 +80,7 @@ class VarConv2DCompute : public KernelLite { std::vector col_dims_vec{top_size}; col_dims_vec.push_back(1); col->Resize(col_dims_vec); - auto* top_data = col->mutable_data(); + auto* top_data = col->template mutable_data(); const auto* bottom_data = input.data(); int kernel_win_size = kernel_h * kernel_w; @@ -149,7 +149,7 @@ class VarConv2DCompute : public KernelLite { // const auto& offset_y = in_row->lod()[0]; const auto& offset_y = param.X->lod()[1]; const auto& offset_x = param.X->lod()[2]; - std::vector top_offset; + std::vector top_offset; int top_size = 0; top_offset.push_back(top_size); for (int b = 0; b < batch; ++b) { @@ -178,9 +178,9 @@ class VarConv2DCompute : public KernelLite { std::vector top_dims_vec{top_size}; top_dims_vec.push_back(1); top->Resize(top_dims_vec); - auto* top_data = top->mutable_data(); - const auto* w_data = w->data(); - const auto* col_data = col->data(); + auto* top_data = top->template mutable_data(); + const auto* w_data = w->template data(); + const auto* col_data = col->template data(); auto blas = lite::x86::math::GetBlas(context); for (int b = 0; b < batch; ++b) { diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc index d6ae5a67bfc9deba1fb097fa5c0c0cf323b65e48..edef8cb2df75dfb45ad4964975365d4ddbbe9086 100644 --- a/lite/kernels/x86/var_conv_2d_compute_test.cc +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -140,7 +140,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, const auto& col_offset = col->lod()[0]; const auto& offset_x = in_col->lod()[0]; const 
auto& offset_y = in_row->lod()[0];
-  std::vector top_offset;
+  std::vector top_offset;
   int top_size = 0;
   top_offset.push_back(top_size);
   for (int b = 0; b < batch; ++b) {
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index d9c6de358650d5bc84e12762198988c0e46e34bf..07dc127695e3906719b45020a585966877bec868 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -1,4 +1,27 @@
+if(NOT LITE_WITH_XPU)
+  return()
+endif()
 
-add_subdirectory(bridges)
-
-add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
+if(LITE_WITH_XTCL)
+  add_subdirectory(bridges)
+  add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges})
+else()
+  add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu)
+  add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(activation_compute_xpu XPU basic SRCS activation_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(pool_compute_xpu XPU basic SRCS pool_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(elementwise_compute_xpu XPU basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps})
+  add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps})
+endif()
diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0ba33110d2b3efd4a5e164da86ea949c95bbb63
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__multi_encoder_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void XPUMultiEncoderCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+
+  for (auto* fc_weight : param.fc_weight) {
+    arg_fc_weight_.push_back(
+        reinterpret_cast<const int16_t*>(fc_weight->data<float>()));
+  }
+  for (auto* fc_bias : param.fc_bias) {
+    arg_fc_bias_.push_back(fc_bias->data<float>());
+  }
+  for (auto* ln_scale : param.ln_scale) {
+    arg_ln_scale_.push_back(ln_scale->data<float>());
+  }
+  for (auto* ln_bias : param.ln_bias) {
+    arg_ln_bias_.push_back(ln_bias->data<float>());
+  }
+  if (param.act_type == "relu") {
+    act_type_ = xdnn::Activation_t::RELU;
+  }
+}
+
+void XPUMultiEncoderCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int batch_size = param.input->dims()[0];
+  int seq_len = param.input->dims()[1];
+  int r = xdnn::bert_encoder_transformer_int16(
+      ctx.GetRawContext(), /* context */
+      batch_size, /* batch_size */
+      seq_len, /* from_seq_len */
+      seq_len, /* to_seq_len */
+      param.head_num, /* head_num */
+      param.size_per_head, /* size_per_head */
+      param.n_layers, /* n_layers */
+      param.input->data<float>(), /* from_tensor */
+      param.input->data<float>(), /* to_tensor */
+      param.mask->data<float>(), /* att_mask */
+      &arg_fc_weight_[0], /* fc_weights */
+      &arg_fc_bias_[0], /* fc_biass */
+      &arg_ln_scale_[0], /* ln_scales */
+      &arg_ln_bias_[0], /* ln_biass */
+      param.output->mutable_data<float>(TARGET(kXPU)), /* output */
+      param.fc_weight_max->data<float>(), /* fc_weights_max */
+      true, /* pretrans_b */
+      true, /* use_l3 */
+      act_type_ /* act_type */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__multi_encoder,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUMultiEncoderCompute,
+                     def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("FCWeightMax", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..71db4e6f44f9c36e4acdaf0a440463a61f4e3099
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUMultiEncoderCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUMultiEncoderParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  std::vector<const int16_t*> arg_fc_weight_;
+  std::vector<const float*> arg_fc_bias_;
+  std::vector<const float*> arg_ln_scale_;
+  std::vector<const float*> arg_ln_bias_;
+  xdnn::Activation_t act_type_{xdnn::Activation_t::GELU};
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.cc b/lite/kernels/xpu/__xpu__resnet50_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e63e03fc9c1d52be42a8ff9b1d6260b3396a2fe
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__resnet50_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void XPUResNet50Compute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+
+  for (auto* filter : param.filter) {
+    arg_filter_.push_back(
+        reinterpret_cast<const int16_t*>(filter->data<float>()));
+  }
+  for (auto* bias : param.bias) {
+    arg_bias_.push_back(bias->data<float>());
+  }
+  for (auto* max_filter : param.max_filter) {
+    arg_max_filter_.push_back(max_filter->data<float>());
+  }
+}
+
+void XPUResNet50Compute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int batch_size = param.input->dims()[0];
+  int r = xdnn::conv2d_int16_resnet(
+      ctx.GetRawContext(), /* context */
+      batch_size, /* num */
+      param.input->data<float>(), /* bottom */
+      &arg_filter_[0], /* weight_list */
+      param.output->mutable_data<float>(TARGET(kXPU)), /* top */
+      &arg_bias_[0], /* bias_list */
+      &arg_max_filter_[0] /* max_filter_list */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__resnet50,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUResNet50Compute,
+                     def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.h b/lite/kernels/xpu/__xpu__resnet50_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d42f8b6f26edf615dba165b553b633673a4ae66
--- /dev/null
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class XPUResNet50Compute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUResNet50Param;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  std::vector<const int16_t*> arg_filter_;
+  std::vector<const float*> arg_max_filter_;
+  std::vector<const float*> arg_bias_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a46b33252e40a56299ebc7d0f133520a04b7cb20
--- /dev/null
+++ b/lite/kernels/xpu/activation_compute.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/activation_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void ReluCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(), /* context */
+      xdnn::Activation_t::RELU, /* type */
+      param.X->numel(), /* len */
+      param.X->data<float>(), /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void TanhCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(), /* context */
+      xdnn::Activation_t::TANH, /* type */
+      param.X->numel(), /* len */
+      param.X->data<float>(), /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+void SigmoidCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+
+  int r = xdnn::activation_forward(
+      ctx.GetRawContext(), /* context */
+      xdnn::Activation_t::SIGMOID, /* type */
+      param.X->numel(), /* len */
+      param.X->data<float>(), /* x */
+      param.Out->mutable_data<float>(TARGET(kXPU)) /* y */);
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(
+    relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(sigmoid,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::SigmoidCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..e440bde4146a88929c52c20ff1038eb35be91d38
--- /dev/null
+++ b/lite/kernels/xpu/activation_compute.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class ReluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~ReluCompute() = default; +}; + +class TanhCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~TanhCompute() = default; +}; + +class SigmoidCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + virtual void Run(); + + virtual ~SigmoidCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/batch_norm_compute.cc b/lite/kernels/xpu/batch_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b3139165a06fd0f42897e9ed6c98d80d27adeab --- /dev/null +++ b/lite/kernels/xpu/batch_norm_compute.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/batch_norm_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void BatchNormCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + float epsilon = param.epsilon; + auto& x_dims = param.x->dims(); + + int r = xdnn::batch_norm_infer_forward( + ctx.GetRawContext(), /* context */ + epsilon, /* epsilon */ + x_dims[0], /* img_n */ + x_dims[1], /* img_c */ + x_dims[2], /* img_h */ + x_dims[3], /* img_w */ + param.x->data(), /* img_gm */ + param.y->mutable_data(TARGET(kXPU)), /* out_gm */ + param.scale->data(), /* scale_gm */ + param.bias->data(), /* bias_gm */ + param.mean->data(), /* mean_gm */ + param.variance->data() /* var__gm */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(batch_norm, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::BatchNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/batch_norm_compute.h b/lite/kernels/xpu/batch_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7b428476b96ca3b2b60c66df28b7f82e8f57bebc --- /dev/null +++ 
b/lite/kernels/xpu/batch_norm_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + + virtual void Run(); + + virtual ~BatchNormCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt index 93f3cdb445af7b75adc76294b287d9963f4e3cca..0d6d708952b0806da7b060bb76b3ce35df352c26 100644 --- a/lite/kernels/xpu/bridges/CMakeLists.txt +++ b/lite/kernels/xpu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_XPU) +if(NOT LITE_WITH_XTCL) return() endif() diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h index dafd8d853210278220b79fdf58895484cbd89ec0..562e5fea9eef92fae306fe4bb48a4e224b3c76ee 100644 --- a/lite/kernels/xpu/bridges/graph.h +++ b/lite/kernels/xpu/bridges/graph.h @@ -14,12 +14,12 @@ #pragma once -#include #include #include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h index 776955854567b919234e7c79dcf6321e8e24b70a..0deb4fd7b4723d97a9159a88c6d8a054a047dc92 100644 --- a/lite/kernels/xpu/bridges/utility.h +++ b/lite/kernels/xpu/bridges/utility.h @@ -14,10 +14,10 @@ #pragma once -#include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/xpu/cast_compute.cc b/lite/kernels/xpu/cast_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7eabd28a16073db218dcd03542bac0d1e3459be --- /dev/null +++ b/lite/kernels/xpu/cast_compute.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/cast_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void CastCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* out = param.Out; + int out_dtype = param.out_dtype; + auto* in_data = x->template data(); + int numel = x->numel(); + + int r = 0; + // BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6; + // SIZE_T = 19;UINT8 = 20;INT8 = 21; + if (out_dtype == 5) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast( + ctx.GetRawContext(), in_data, out_data, numel); + } else if (out_dtype == 2) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast(ctx.GetRawContext(), in_data, out_data, numel); + } else if (out_dtype == 3) { + auto* out_data = out->template mutable_data(TARGET(kXPU)); + r = xdnn::cast( + ctx.GetRawContext(), in_data, out_data, numel); + } else { + CHECK(false); + } + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(cast, + kXPU, + kAny, + kNCHW, + paddle::lite::kernels::xpu::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8992c29732630a5bf0d9c092461569234257e3a9 --- /dev/null +++ b/lite/kernels/xpu/cast_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override; + + virtual ~CastCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/conv_compute.cc b/lite/kernels/xpu/conv_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed692fd0e2d474cbe5ce9f06633280bb09c3878c --- /dev/null +++ b/lite/kernels/xpu/conv_compute.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/conv_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template <> +void Conv2dCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.x->dims(); + auto& w_dims = param.filter->dims(); + int groups = param.groups; + auto& strides = param.strides; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + int r = xdnn::conv2d_forward_int16( + ctx.GetRawContext(), /* context */ + x_dims[0], /* num */ + x_dims[1], /* input_c */ + x_dims[2], /* input_h */ + x_dims[3], /* input_w */ + w_dims[0], /* num_filter */ + w_dims[2], /* kernel_h */ + w_dims[3], /* kernel_w */ + strides[0], /* stride_h */ + strides[1], /* stride_w */ + paddings[0], /* pad_h */ + paddings[1], /* pad_w */ + dilations[0], /* dilation_h */ + dilations[1], /* dilation_w */ + groups, /* group */ + param.x->data(), /* bottom */ + param.filter->data(), /* weight */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + nullptr, /* bias */ + nullptr, /* branch */ + xdnn::Activation_t::LINEAR, /* type */ + nullptr, /* max_image_ptr */ + nullptr, /* max_filter_ptr */ + nullptr /* max_result_ptr */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using Conv2dFp32 = xpu::Conv2dCompute; + +REGISTER_LITE_KERNEL(conv2d, kXPU, kFloat, kNCHW, Conv2dFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv_compute.h b/lite/kernels/xpu/conv_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b7631ce4e5773afe7cdd797a245c806b51d25c56 --- /dev/null +++ b/lite/kernels/xpu/conv_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class Conv2dCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + + virtual void Run(); + + virtual ~Conv2dCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/dropout_compute.cc b/lite/kernels/xpu/dropout_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f42d3eeff5da40251c27476a53709aee1e65fbcf --- /dev/null +++ b/lite/kernels/xpu/dropout_compute.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/dropout_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void DropoutCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int size = param.x->numel() * sizeof(float); + + int r = xdnn::memcpy_device( + ctx.GetRawContext(), /* context */ + param.output->mutable_data(TARGET(kXPU)), /* dst */ + param.x->data(), /* src */ + size /* size */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(dropout, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::DropoutCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/dropout_compute.h b/lite/kernels/xpu/dropout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..0eaafb4f5555a163623402fee82d50bfa095b0b3 --- /dev/null +++ b/lite/kernels/xpu/dropout_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class DropoutCompute : public KernelLite { + public: + using param_t = operators::DropoutParam; + + virtual void Run(); + + virtual ~DropoutCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e37337948bf639832ea936de2b5b929d26f534cc --- /dev/null +++ b/lite/kernels/xpu/elementwise_compute.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/elementwise_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ElementwiseAddCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.X->dims().data(); + auto& y_dims = param.Y->dims(); + int axis = param.axis; + if (param.axis == -1) { + axis = x_dims.size() - y_dims.size(); + } + int iter = std::accumulate( + x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies()); + int stride = param.Y->numel(); + + for (int i = 0; i < iter; ++i) { + const float* x_ptr = param.X->data() + i * stride; + const float* y_ptr = param.Y->data(); + float* o_ptr = param.Out->mutable_data(TARGET(kXPU)) + i * stride; + int r = xdnn::elementwise_add(ctx.GetRawContext(), /* context */ + x_ptr, /* x */ + y_ptr, /* y */ + o_ptr, /* z */ + stride /* len */); + CHECK_EQ(r, 0); + } +} + +void ElementwiseSubCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.X->dims().data(); + auto& y_dims = param.Y->dims(); + int axis = param.axis; + if (param.axis == -1) { + axis = x_dims.size() - y_dims.size(); + } + int iter = std::accumulate( + x_dims.begin(), x_dims.begin() + axis, 1, std::multiplies()); + int stride = param.Y->numel(); + + for (int i = 0; i < iter; ++i) { + const float* x_ptr = param.X->data() + i * stride; + const float* y_ptr = param.Y->data(); + float* o_ptr = param.Out->mutable_data(TARGET(kXPU)) + i * stride; + int r = xdnn::elementwise_sub(ctx.GetRawContext(), /* context */ + x_ptr, /* x */ + y_ptr, /* y */ + o_ptr, /* z */ + stride /* len */); + CHECK_EQ(r, 0); + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ElementwiseSubCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..863ee3c643f9c431dacd057e251941914b1dd1c5 --- /dev/null +++ b/lite/kernels/xpu/elementwise_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + virtual void Run(); + + virtual ~ElementwiseAddCompute() = default; +}; + +class ElementwiseSubCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + virtual void Run(); + + virtual ~ElementwiseSubCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/io_copy_compute.cc b/lite/kernels/xpu/io_copy_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ee809563475434cfa286cc3a535bf9acac5d923 --- /dev/null +++ b/lite/kernels/xpu/io_copy_compute.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +/* + * This kernel copies a tensor from host to XPU. + */ +class IoCopyHostToXPUCompute + : public KernelLite { + public: + void Run() override { + auto& param = Param(); + CHECK(param.x->target() == TARGET(kHost) || + param.x->target() == TARGET(kX86) || + param.x->target() == TARGET(kARM)); + auto mem_size = param.x->memory_size(); + VLOG(4) << "host to xpu, copy size " << mem_size; + auto* data = param.y->mutable_data(TARGET(kXPU), mem_size); + TargetWrapperXPU::MemcpySync( + data, param.x->raw_data(), mem_size, IoDirection::HtoD); + } + + std::unique_ptr GetTypeInferHandler() override { + std::unique_ptr res(new type_infer_handler_t); + *res = [](const std::map& inputs, + const std::string& out) -> const Type* { + CHECK(!inputs.empty()); + auto* type = inputs.at("Input"); + CHECK(type->target() == TARGET(kHost)); + + auto out_place = type->place(); + out_place.target = TARGET(kXPU); + auto* out_type = Type::Get(type->id(), + out_place.target, + out_place.precision, + out_place.layout, + out_place.device); + return out_type; + }; + return res; + } + + std::string doc() const override { return "Copy IO from HOST to XPU"; } +}; + +/* + * This kernel copies a tensor from XPU to host. 
+ */ +class IoCopyXPUToHostCompute + : public KernelLite { + public: + void Run() override { + auto& param = Param(); + CHECK(param.x->target() == TARGET(kXPU)); + auto mem_size = param.x->memory_size(); + VLOG(4) << "xpu to host, copy size " << mem_size; + auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + TargetWrapperXPU::MemcpySync( + data, param.x->raw_data(), mem_size, IoDirection::DtoH); + } + + std::string doc() const override { return "Copy IO from XPU to HOST"; } +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(io_copy, + kXPU, + kAny, + kAny, + paddle::lite::kernels::xpu::IoCopyHostToXPUCompute, + host_to_device) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy, + kXPU, + kAny, + kAny, + paddle::lite::kernels::xpu::IoCopyXPUToHostCompute, + device_to_host) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy_once, + kXPU, + kAny, + kAny, + paddle::lite::kernels::xpu::IoCopyHostToXPUCompute, + host_to_device) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy_once, + kXPU, + kAny, + kAny, + paddle::lite::kernels::xpu::IoCopyXPUToHostCompute, + device_to_host) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/xpu/layer_norm_compute.cc b/lite/kernels/xpu/layer_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..538ad849d93182488ca35433800f687027c02e4a --- /dev/null +++ b/lite/kernels/xpu/layer_norm_compute.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/layer_norm_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void LayerNormCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto x_dims = param.X->dims(); + auto axis = param.begin_norm_axis; + auto matrix_dim = x_dims.Flatten2D(axis); + float epsilon = param.epsilon; + + int r = xdnn::layer_norm(ctx.GetRawContext(), /* context */ + matrix_dim[0], /* m */ + matrix_dim[1], /* n */ + param.X->data(), /* in */ + param.Y->mutable_data(TARGET(kXPU)), /* out */ + param.Scale->data(), /* scale */ + param.Bias->data(), /* bias */ + epsilon /* epsilon */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(layer_norm, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/layer_norm_compute.h b/lite/kernels/xpu/layer_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2df37795811ef8027e12b25139f2b7091cceed --- /dev/null +++ b/lite/kernels/xpu/layer_norm_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + virtual void Run(); + + virtual ~LayerNormCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/lookup_table_compute.cc b/lite/kernels/xpu/lookup_table_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..568d303adefaa06bb8665b4cc92d4a949419d587 --- /dev/null +++ b/lite/kernels/xpu/lookup_table_compute.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/lookup_table_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void LookupTableCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int num = param.Ids->numel(); + int embed_dim = param.W->dims()[1]; + + int r = xdnn::embedding( + ctx.GetRawContext(), /* context */ + num, /* num */ + param.Ids->data(), /* indices */ + embed_dim, /* embed_dim */ + param.W->data(), /* table */ + param.Out->mutable_data(TARGET(kXPU)) /* top */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(lookup_table, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/lookup_table_compute.h b/lite/kernels/xpu/lookup_table_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba1afc869cf9c3a49ab1ad29c66c6c89ba87d19 --- /dev/null +++ b/lite/kernels/xpu/lookup_table_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class LookupTableCompute : public KernelLite { + public: + using param_t = operators::LookupTableParam; + + virtual void Run(); + + virtual ~LookupTableCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/matmul_compute.cc b/lite/kernels/xpu/matmul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..62e018889d415de8968444594804facc3292e799 --- /dev/null +++ b/lite/kernels/xpu/matmul_compute.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/matmul_compute.h" +#include "lite/backends/xpu/math.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace math = paddle::lite::xpu::math; + +void MatMulCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.X; + auto* y = param.Y; + auto* out = param.Out; + + auto mat_dim_a = math::CreateMatrixDescriptor( + math::RowMatrixFromVector(x->dims()), 0, param.transpose_X); + auto mat_dim_b = math::CreateMatrixDescriptor( + math::ColumnMatrixFromVector(y->dims()), 0, param.transpose_Y); + int lda = (mat_dim_a.trans_ ? mat_dim_a.height_ : mat_dim_a.width_); + int ldb = (mat_dim_b.trans_ ? mat_dim_b.height_ : mat_dim_b.width_); + int ldc = mat_dim_b.width_; + + int r = 0; + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + r = xdnn::fc_int16(ctx.GetRawContext(), /* context */ + mat_dim_a.trans_, /* TransA */ + mat_dim_b.trans_, /* TransB */ + mat_dim_a.height_, /* m */ + mat_dim_b.width_, /* n */ + mat_dim_a.width_, /* k */ + param.alpha, /* alpha */ + x->data(), /* A */ + y->data(), /* B */ + 0.0f, /* beta */ + out->mutable_data(TARGET(kXPU)) /* C */); + } else { + // batch matmul + r = xdnn::gemm_strided_batched_int16( + ctx.GetRawContext(), /* context */ + mat_dim_a.trans_, /* TransA */ + mat_dim_b.trans_, /* TransB */ + mat_dim_a.batch_size_, /* batch_size */ + mat_dim_a.height_, /* M */ + mat_dim_b.width_, /* N */ + mat_dim_a.width_, /* K */ + param.alpha, /* alpha */ + x->data(), /* A */ + lda, /* lda */ + mat_dim_a.stride_, /* stride_a */ + y->data(), /* B */ + ldb, /* ldb */ + mat_dim_b.stride_, /* stride_b */ + 0.0f, /* beta */ + out->mutable_data(TARGET(kXPU)), /* C */ + ldc, /* ldc */ + mat_dim_a.height_ * mat_dim_b.width_ /* stride_c */); + } + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + matmul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MatMulCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..aca3cbc603eff490ae19fd2546352adca3c1a7cf --- /dev/null +++ b/lite/kernels/xpu/matmul_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + virtual void Run(); + + virtual ~MatMulCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/mul_compute.cc b/lite/kernels/xpu/mul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..8aa93a9c8b8d84874b95dae2c15bf985585c916c --- /dev/null +++ b/lite/kernels/xpu/mul_compute.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/mul_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void MulCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& origin_x = *param.x; + auto& origin_y = *param.y; + auto& x_dims = origin_x.dims(); + auto& y_dims = origin_y.dims(); + Tensor x_matrix, y_matrix; + if (x_dims.size() > 2) { + x_matrix = ReshapeToMatrix(origin_x, param.x_num_col_dims); + } else { + x_matrix = origin_x; + } + if (y_dims.size() > 2) { + y_matrix = ReshapeToMatrix(origin_y, param.y_num_col_dims); + } else { + y_matrix = origin_y; + } + int m = x_matrix.dims()[0]; + int k = x_matrix.dims()[1]; + int n = y_matrix.dims()[1]; + + int r = + xdnn::fc_int16(ctx.GetRawContext(), /* context */ + false, /* TransA */ + false, /* TransB */ + m, + n, + k, + 1.0f, /* alpha */ + x_matrix.data(), /* A */ + y_matrix.data(), /* B */ + 0.0f, /* beta */ + param.output->mutable_data(TARGET(kXPU)) /* C */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + mul, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::MulCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/mul_compute.h b/lite/kernels/xpu/mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..bb2778c0e73189b11135395b42655e0250bbfd0a --- /dev/null +++ b/lite/kernels/xpu/mul_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +static inline lite::Tensor ReshapeToMatrix(const lite::Tensor& src, + int num_col_dims) { + int rank = src.dims().size(); + if (rank == 2) { + return src; + } + lite::Tensor res; + res.ShareDataWith(src); + res.Resize(src.dims().Flatten2D(num_col_dims)); + return res; +} + +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + + virtual void Run(); + + virtual ~MulCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4480e4875cb3317ddeeea7017f4aa825e2afe320 --- /dev/null +++ b/lite/kernels/xpu/pool_compute.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/pool_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void Pool2DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.x->dims(); + CHECK_EQ(x_dims.size(), 4); + auto& o_dims = param.output->dims(); + CHECK_EQ(param.ksize.size(), 2); + if (param.global_pooling) { + param.ksize[0] = x_dims[2]; + param.ksize[1] = x_dims[3]; + } + CHECK_EQ(param.strides.size(), 2); + CHECK_EQ(param.paddings->size(), 4); + auto& paddings = *param.paddings; + auto type = xdnn::MAX_WITHOUT_INDEX; + if (param.pooling_type == "avg") { + if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 && + paddings[3] == 0) { + type = xdnn::AVG_WITHOUT_PAD; + } else { + type = xdnn::AVG_WITH_PAD; + } + } + + int r = xdnn::pooling_forward( + ctx.GetRawContext(), /* context */ + param.x->data(), /* x */ + param.output->mutable_data(TARGET(kXPU)), /* y */ + nullptr, /* y_index */ + type, /* type */ + x_dims[0] * x_dims[1], /* c */ + x_dims[2], /* in_h */ + x_dims[3], /* in_w */ + paddings[0], /* pad_left */ + paddings[1], /* pad_right */ + paddings[2], /* pad_up */ + paddings[3], /* pad_down */ + param.ksize[0], /* win_h */ + param.ksize[1], /* win_w */ + param.strides[0], /* stride_h */ + param.strides[1], /* stride_w */ + o_dims[2], /* out_h */ + o_dims[3] /* out_w */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h new file 
mode 100644 index 0000000000000000000000000000000000000000..5648554c41c76396184b7dc536f8c8628cbf23e4 --- /dev/null +++ b/lite/kernels/xpu/pool_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class Pool2DCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + + virtual void Run(); + + virtual ~Pool2DCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/scale_compute.cc b/lite/kernels/xpu/scale_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c8d3b0a238880402c09e014aeb91a898b252660 --- /dev/null +++ b/lite/kernels/xpu/scale_compute.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/scale_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ScaleCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.x->dims(); + + int r = xdnn::scale(ctx.GetRawContext(), /* context */ + x_dims.production(), /* len */ + param.scale, /* alpha */ + param.bias, /* beta */ + param.bias_after_scale, /* bias_after_scale */ + param.x->data(), /* x */ + param.output->mutable_data(TARGET(kXPU)) /* y */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + scale, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ScaleCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6989b0f0f31e54a63dac2f7c2090dc676e31acfb --- /dev/null +++ b/lite/kernels/xpu/scale_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class ScaleCompute : public KernelLite { + public: + using param_t = operators::ScaleParam; + + virtual void Run(); + + virtual ~ScaleCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/slice_compute.cc b/lite/kernels/xpu/slice_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..5919f84dbd3f0923cc44f2ad4bee13d1bb13f98d --- /dev/null +++ b/lite/kernels/xpu/slice_compute.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/slice_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SliceCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.X->dims(); + x_shape_.reserve(x_dims.size()); + x_dim_begin_.reserve(x_dims.size()); + x_dim_end_.reserve(x_dims.size()); +} + +void SliceCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto x_dims = param.X->dims(); + for (size_t i = 0; i < x_dims.size(); ++i) { + x_shape_[i] = x_dims[i]; + x_dim_begin_[i] = 0; + x_dim_end_[i] = x_dims[i]; + } + for (size_t i = 0; i < param.axes.size(); ++i) { + int axis = param.axes[i]; + x_dim_begin_[axis] = param.starts[i]; + x_dim_end_[axis] = param.ends[i]; + } + + int ndim = param.X->dims().size(); + int r = xdnn::slice_forward( + ctx.GetRawContext(), /* context */ + &x_shape_[0], /* shape */ + &x_dim_begin_[0], /* starts */ + &x_dim_end_[0], /* ends */ + ndim, /* n */ + param.X->data(), /* in */ + param.Out->mutable_data(TARGET(kXPU)) /* out */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + slice, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SliceCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/slice_compute.h b/lite/kernels/xpu/slice_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6fb34e30c143d0890dc76e9b0fd3b2d1bfcef8e9 --- /dev/null +++ b/lite/kernels/xpu/slice_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SliceCompute : public KernelLite { + public: + using param_t = operators::SliceParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + virtual ~SliceCompute() = default; + + private: + std::vector x_shape_; + std::vector x_dim_begin_; + std::vector x_dim_end_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/softmax_compute.cc b/lite/kernels/xpu/softmax_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e4a6c19f3bfc9ced852c5b6aa7f63e568bc7669 --- /dev/null +++ b/lite/kernels/xpu/softmax_compute.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/softmax_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SoftmaxCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.x->dims(); + int axis = CanonicalAxis(param.axis, x_dims.size()); + int rows = SizeToAxis(axis, x_dims); + int cols = SizeFromAxis(axis, x_dims); + + int r = xdnn::softmax2d_forward( + ctx.GetRawContext(), /* context */ + param.x->data(), /* x */ + param.output->mutable_data(TARGET(kXPU)), /* y */ + rows, /* rows */ + cols /* cols */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(softmax, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/softmax_compute.h b/lite/kernels/xpu/softmax_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e807f38a2ea3c9645b78340ac4dc87d1984c40f7 --- /dev/null +++ b/lite/kernels/xpu/softmax_compute.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, lite::DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, lite::DDim dims) { + int size = 1; + for (size_t i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +class SoftmaxCompute : public KernelLite { + public: + using param_t = operators::SoftmaxParam; + + virtual void Run(); + + virtual ~SoftmaxCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9e5c19d25135ac5877e38eaf65829fefc500e07 --- /dev/null +++ b/lite/kernels/xpu/stack_compute.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/stack_compute.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void StackCompute::PrepareForRun() { + auto& param = this->Param(); + + int n = param.X.size(); + void* x_ptr = nullptr; + xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */); + x_ptr_guard_.reset(x_ptr); + x_ptr_cpu_.reserve(n); +} + +void StackCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + int n = param.X.size(); + auto x_dims = param.X[0]->dims(); + int axis = param.axis; + // XXX(miaotianxiang): +1? 
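+  // The +1 is because stack's output has one more dimension than each input: a negative axis is defined over the output rank, i.e. x_dims.size() + 1.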
+ if (axis < 0) axis += (x_dims.size() + 1); + auto matrix = x_dims.Flatten2D(axis); + int height = matrix[0]; + int width = matrix[1]; + + for (int i = 0; i < n; ++i) { + x_ptr_cpu_[i] = param.X[i]->data(); + } + xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE); + + int r = xdnn::stack_forward( + ctx.GetRawContext(), /* context */ + height, /* height */ + width, /* width */ + n, /* n */ + x_ptr_guard_.get(), /* x_ptr */ + param.Out->mutable_data(TARGET(kXPU)) /* out */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6f77cbb3a73bce2d5496f840b2a1f8e14313e776 --- /dev/null +++ b/lite/kernels/xpu/stack_compute.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +struct XPUFreeDeleter { + void operator()(void* p) const { xpu_free(p); } +}; + +class StackCompute : public KernelLite { + public: + using param_t = operators::StackParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + virtual ~StackCompute() = default; + + private: + std::unique_ptr x_ptr_guard_; + std::vector x_ptr_cpu_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 1faada3978a2ab33fbe0135d57f21a94c97d5c61..601c8821bc826e350c233573bf7eff89cdf5c1f5 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -14,10 +14,10 @@ #pragma once -#include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index ae9ec3ad47fbc00c91ba06c1597bd65e510b629b..c7fa674bff745df29b271e10c8c4d99687a889ed 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -14,7 +14,7 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) -add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) +add_operator(activation_basic_ops basic SRCS activation_ops.cc DEPS 
${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) @@ -60,6 +60,7 @@ add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) # 3.extra ops +add_operator(activation_extra_ops extra SRCS activation_extra_ops.cc DEPS ${op_DEPS}) add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) @@ -73,6 +74,7 @@ add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS}) add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) +add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) @@ -105,6 +107,7 @@ add_operator(conditional_block_op_lite extra SRCS conditional_block_op.cc DEPS $ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS ${op_DEPS}) add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) +add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) @@ -148,6 +151,10 @@ add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DE add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS}) +# Only for XPU +add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS}) + if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc DEPS fc_op memory diff --git a/lite/operators/__xpu__multi_encoder_op.cc b/lite/operators/__xpu__multi_encoder_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d8aca942592668831b8d46d3e07ce83a57f1011 --- /dev/null +++ b/lite/operators/__xpu__multi_encoder_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/__xpu__multi_encoder_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMultiEncoderOp::CheckShape() const { return true; } + +bool XPUMultiEncoderOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + param_.output->Resize(input_shape); + return true; +} + +bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.mask = const_cast( + &scope->FindVar(op_desc.Input("Mask").front())->Get()); + param_.fc_weight_max = const_cast( + &scope->FindVar(op_desc.Input("FCWeightMax").front()) + ->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.fc_weight.clear(); + for (auto& name : op_desc.Input("FCWeight")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.fc_weight.push_back(t); + } + param_.fc_bias.clear(); + for (auto& name : op_desc.Input("FCBias")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.fc_bias.push_back(t); + } + param_.ln_scale.clear(); + for (auto& name : op_desc.Input("LNScale")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.ln_scale.push_back(t); + } + param_.ln_bias.clear(); + for (auto& name : op_desc.Input("LNBias")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.ln_bias.push_back(t); + } + + param_.n_layers = op_desc.GetAttr("n_layers"); + param_.head_num = op_desc.GetAttr("head_num"); + param_.size_per_head = op_desc.GetAttr("size_per_head"); + param_.act_type = op_desc.GetAttr("act_type"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__multi_encoder, + paddle::lite::operators::XPUMultiEncoderOp); diff --git a/lite/operators/__xpu__multi_encoder_op.h b/lite/operators/__xpu__multi_encoder_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6c20562151ad751f3a8c72ce9ce262cf1f0a286a --- /dev/null +++ b/lite/operators/__xpu__multi_encoder_op.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMultiEncoderOp : public OpLite { + public: + XPUMultiEncoderOp() {} + explicit XPUMultiEncoderOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "MultiEncoder"; } + + private: + mutable XPUMultiEncoderParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__resnet50_op.cc b/lite/operators/__xpu__resnet50_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..02ea6dc1799baaab486b839a4d3137020a9f7a5c --- /dev/null +++ b/lite/operators/__xpu__resnet50_op.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__resnet50_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUResNet50Op::CheckShape() const { return true; } + +bool XPUResNet50Op::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + input_shape[1] = 2048; + input_shape[2] = 1; + input_shape[3] = 1; + param_.output->Resize(input_shape); + return true; +} + +bool XPUResNet50Op::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.filter.clear(); + for (auto& name : op_desc.Input("Filter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.filter.push_back(t); + } + param_.bias.clear(); + for (auto& name : op_desc.Input("Bias")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.bias.push_back(t); + } + param_.max_filter.clear(); + for (auto& name : op_desc.Input("MaxFilter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.max_filter.push_back(t); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__resnet50, paddle::lite::operators::XPUResNet50Op); diff --git a/lite/operators/__xpu__resnet50_op.h b/lite/operators/__xpu__resnet50_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97f4d42006c64243818af21aa26f708d7889ba96 --- /dev/null +++ b/lite/operators/__xpu__resnet50_op.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUResNet50Op : public OpLite { + public: + XPUResNet50Op() {} + explicit XPUResNet50Op(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "ResNet50"; } + + private: + mutable XPUResNet50Param param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/activation_extra_ops.cc b/lite/operators/activation_extra_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c773b4327abd48532a1bc9283963bd0dad19da6 --- /dev/null +++ b/lite/operators/activation_extra_ops.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/op_registry.h" +#include "lite/operators/activation_ops.h" + +// Extra activation ops +REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(hard_swish, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(reciprocal, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index 13abe0c53e95363e7f54c56819eaac26ef720072..a3d9895955d99b96609a8c35e2493b17a11b9181 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -74,6 +74,14 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { } else if (opdesc.Type() == "abs") { // abs param_.active_type = lite_api::ActivationType::kAbs; + } else if (opdesc.Type() == "hard_swish") { + // hard_swish + param_.active_type = lite_api::ActivationType::kHardSwish; + param_.hard_swish_threshold = opdesc.GetAttr<float>("threshold"); + param_.hard_swish_scale = opdesc.GetAttr<float>("scale"); + param_.hard_swish_offset = opdesc.GetAttr<float>("offset"); + } else if (opdesc.Type() == "reciprocal") { + param_.active_type = lite_api::ActivationType::kReciprocal; } VLOG(4) << "opdesc.Type():" << opdesc.Type(); @@ -84,21 +92,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { } // namespace operators } // namespace lite } // namespace paddle -REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(relu_clipped, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp); + +// Basic activation ops REGISTER_LITE_OP(sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(tanh, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); -REGISTER_LITE_OP(gelu, paddle::lite::operators::ActivationOp); 
+REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/calib_op.cc b/lite/operators/calib_op.cc index 8da8747f8c9df038ee424395fd75a20a718f1970..ce45fa1409b83e922fb132e79562bfba23a19414 100644 --- a/lite/operators/calib_op.cc +++ b/lite/operators/calib_op.cc @@ -26,6 +26,7 @@ bool CalibOpLite::CheckShape() const { } bool CalibOpLite::InferShapeImpl() const { param_.output->Resize(param_.input->dims()); + param_.output->set_lod(param_.input->lod()); return true; } diff --git a/lite/operators/ctc_align_op.cc b/lite/operators/ctc_align_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea8e0c27059258a4e7c857c80ab64eb381446035 --- /dev/null +++ b/lite/operators/ctc_align_op.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/ctc_align_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool CtcAlignOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.input != nullptr); + CHECK_OR_FALSE(param_.output != nullptr); + + auto* input = param_.input; + auto* input_length = param_.input_length; + auto input_lod = input->lod(); + CHECK_OR_FALSE(!input_lod.empty() || input_length != nullptr); + return true; +} + +bool CtcAlignOpLite::InferShapeImpl() const { + auto input_dims = param_.input->dims(); + // It is tricky to set the wrong dimension here. + param_.output->Resize(input_dims); + if (param_.input_length != nullptr && param_.output_length != nullptr) { + param_.output_length->Resize({input_dims[0], 1}); + } + return true; +} + +bool CtcAlignOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + AttachInput(op_desc, scope, "Input", false, ¶m_.input); + AttachInput(op_desc, scope, "InputLength", true, ¶m_.input_length); + AttachOutput(op_desc, scope, "Output", false, ¶m_.output); + AttachOutput(op_desc, scope, "OutputLength", true, ¶m_.output_length); + param_.blank = op_desc.GetAttr("blank"); + param_.merge_repeated = op_desc.GetAttr("merge_repeated"); + param_.padding_value = op_desc.GetAttr("padding_value"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(ctc_align, paddle::lite::operators::CtcAlignOpLite); diff --git a/lite/operators/ctc_align_op.h b/lite/operators/ctc_align_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7593860e06c3d0104ca1f7ea7281d23149408923 --- /dev/null +++ b/lite/operators/ctc_align_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class CtcAlignOpLite : public OpLite { + public: + CtcAlignOpLite() {} + + explicit CtcAlignOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "ctc_align"; } + + private: + mutable CtcAlignParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/io_copy_op.cc b/lite/operators/io_copy_op.cc index 05b2d3d800d2d2989ae23f9a1ccac57021e82ac1..af53212caae0676526db4ff9cdeec0b71a6e0a88 100644 --- a/lite/operators/io_copy_op.cc +++ b/lite/operators/io_copy_op.cc @@ -26,6 +26,7 @@ bool IoCopyOp::CheckShape() const { } bool IoCopyOp::InferShapeImpl() const { param_.y->Resize(param_.x->dims()); + param_.y->set_lod(param_.x->lod()); return true; } bool IoCopyOp::Run() { return OpLite::Run(); } diff --git a/lite/operators/layout_op.cc b/lite/operators/layout_op.cc index d71dab68702ddd53af1540c2a6dce14d43b27e09..4465dfd5d6d49889777b04c3b661cea1e3d3e311 100644 --- a/lite/operators/layout_op.cc +++ b/lite/operators/layout_op.cc @@ -26,6 +26,7 @@ bool LayoutOp::CheckShape() const { } bool LayoutOp::InferShapeImpl() const { param_.y->Resize(param_.x->dims()); + param_.y->set_lod(param_.x->lod()); return true; } bool LayoutOp::Run() { return OpLite::Run(); } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 3fdca389bca1ba09ebfe008365b6992b717270d8..466de112fb2983e325b2bec17e90018984d7e233 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -336,17 +336,22 @@ struct ConcatParam : ParamBase { /// ----------------------- activation operators ---------------------- struct ActivationParam : ParamBase { const lite::Tensor* X{}; + lite::Tensor* Out{}; + lite_api::ActivationType active_type; + bool has_active{false}; float Leaky_relu_alpha{0}; // leaky_relu param float Relu_clipped_coef{6}; // relu_clipped param std::string Prelu_mode{ "channel"}; // prelu param, can be "all", "channel" or "element" lite::Tensor* Prelu_alpha{}; // prelu param float Swish_beta; // swish param + // hard_sigmoid param float hard_sigmoid_slope{0.2}; float hard_sigmoid_offset{0.5}; - lite::Tensor* Out{}; - bool has_active{false}; - lite_api::ActivationType active_type; + // hard_swish param + float hard_swish_threshold{6.0}; + float hard_swish_scale{6.0}; + float hard_swish_offset{3.0}; }; struct ActivationGradParam : ParamBase { @@ -1019,6 +1024,12 @@ struct SequenceExpandParam : ParamBase { int ref_level{-1}; }; +struct SequenceUnpadParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* Length{}; + lite::Tensor* Out{}; +}; + struct SequenceExpandAsParam : ParamBase { const lite::Tensor* 
x{nullptr}; const lite::Tensor* y{nullptr}; @@ -1438,6 +1449,40 @@ struct CrfDecodingParam : ParamBase { lite::Tensor* viterbi_path{}; }; +struct CtcAlignParam : ParamBase { + lite::Tensor* input{}; + lite::Tensor* input_length{}; + lite::Tensor* output{}; + lite::Tensor* output_length{}; + int blank{0}; + bool merge_repeated{true}; + int padding_value{0}; +}; + +struct XPUResNet50Param : ParamBase { + lite::Tensor* input{}; + std::vector<lite::Tensor*> filter; + std::vector<lite::Tensor*> bias; + std::vector<lite::Tensor*> max_filter; + lite::Tensor* output{}; +}; + +struct XPUMultiEncoderParam : ParamBase { + lite::Tensor* input{}; + std::vector<lite::Tensor*> fc_weight; + std::vector<lite::Tensor*> fc_bias; + std::vector<lite::Tensor*> ln_scale; + std::vector<lite::Tensor*> ln_bias; + lite::Tensor* fc_weight_max{}; + lite::Tensor* mask{}; + lite::Tensor* output{}; + + int n_layers{}; + int head_num{}; + int size_per_head{}; + std::string act_type{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/sequence_unpad_op.cc b/lite/operators/sequence_unpad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b91d43c741f002b2bdb30e161688cd40b462faee --- /dev/null +++ b/lite/operators/sequence_unpad_op.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/sequence_unpad_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceUnpadOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Length); + CHECK_OR_FALSE(param_.Out); + auto x_dims = param_.X->dims(); + auto len_dims = param_.Length->dims(); + CHECK(x_dims.size() >= 2) << "Rank of X can't be less than 2"; + CHECK(len_dims.size() == 1) << "Rank of Length should be 1"; + CHECK(x_dims[0] == len_dims[0]) + << "X and Length should have the same 1st dim"; + return true; +} + +bool SequenceUnpadOp::InferShapeImpl() const { + auto x_dims = param_.X->dims(); + auto len_dims = param_.Length->dims(); + + auto *seq_len_ptr = param_.Length->data(); + int64_t batch_size = len_dims[0]; + std::vector out_lod0(batch_size + 1, 0); + for (int64_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; + } + paddle::lite::LoD out_lod; + out_lod.push_back(out_lod0); + + int64_t out_dim0 = out_lod0.back(); + std::vector out_dims{out_dim0}; + if (x_dims.size() == 2) { + out_dims.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims.push_back(x_dims[i]); + } + } + param_.Out->Resize(out_dims); + param_.Out->set_lod(out_lod); + return true; +} + +bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.Length = const_cast( + &scope->FindVar(opdesc.Input("Length").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_unpad, paddle::lite::operators::SequenceUnpadOp); diff --git a/lite/operators/sequence_unpad_op.h b/lite/operators/sequence_unpad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..508f0437fe32f9b65716f78124df377b99b1ef49 --- /dev/null +++ b/lite/operators/sequence_unpad_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceUnpadOp : public OpLite { + public: + SequenceUnpadOp() {} + explicit SequenceUnpadOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_unpad"; } + + private: + mutable SequenceUnpadParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/stack_op.cc b/lite/operators/stack_op.cc index 0f9ba6662b16ce20acad497a4915cfc848b319cd..d4fb71c4b5cb429d1b3961d5c65f739af56ff39d 100644 --- a/lite/operators/stack_op.cc +++ b/lite/operators/stack_op.cc @@ -47,6 +47,7 @@ bool StackOp::InferShapeImpl() const { bool StackOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { auto X = op_desc.Input("X"); auto Out = op_desc.Output("Y").front(); + param_.X.clear(); for (auto var : X) { param_.X.emplace_back(scope->FindVar(var)->GetMutable()); } diff --git a/lite/tests/CMakeLists.txt b/lite/tests/CMakeLists.txt index 0416c33a81b524b4dba1c1b406d91204cca6946d..a94a46897a8ae8415efd8edf19e216ede69f8888 100644 --- a/lite/tests/CMakeLists.txt +++ b/lite/tests/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(kernels) add_subdirectory(math) add_subdirectory(cv) +add_subdirectory(api) diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c31e3ba58fc793aa92a5b37a59ad612e03c61a53 --- /dev/null +++ b/lite/tests/api/CMakeLists.txt @@ -0,0 +1,14 @@ +if(LITE_WITH_XPU) + lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) +endif() diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b3ee9febb3f0eabd36118680beca66ace9470de4 --- /dev/null +++ b/lite/tests/api/test_bert_lite_xpu.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +template +lite::Tensor GetTensorWithShape(std::vector shape) { + lite::Tensor ret; + ret.Resize(shape); + T* ptr = ret.mutable_data(); + for (int i = 0; i < ret.numel(); ++i) { + ptr[i] = (T)1; + } + return ret; +} + +TEST(Ernie, test_ernie_lite_xpu) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + int64_t batch_size = 1; + int64_t seq_len = 64; + Tensor sample_input = GetTensorWithShape({batch_size, seq_len, 1}); + std::vector input_shape{batch_size, seq_len, 1}; + predictor->GetInput(0)->Resize(input_shape); + predictor->GetInput(1)->Resize(input_shape); + predictor->GetInput(2)->Resize(input_shape); + predictor->GetInput(3)->Resize(input_shape); + + memcpy(predictor->GetInput(0)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(1)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(2)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(3)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + results.emplace_back(std::vector({0.278893, 0.330888, 0.39022})); + auto out = predictor->GetOutput(0); + ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape()[0], 1); + ASSERT_EQ(out->shape()[1], 3); + + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR( + out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b614fec96cbcc5d9c96653681d0e8794cf4ab8f --- /dev/null +++ b/lite/tests/api/test_ernie_lite_xpu.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +template +lite::Tensor GetTensorWithShape(std::vector shape) { + lite::Tensor ret; + ret.Resize(shape); + T* ptr = ret.mutable_data(); + for (int i = 0; i < ret.numel(); ++i) { + ptr[i] = (T)1; + } + return ret; +} + +TEST(Ernie, test_ernie_lite_xpu) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + int64_t batch_size = 1; + int64_t seq_len = 64; + Tensor sample_input = GetTensorWithShape({batch_size, seq_len, 1}); + std::vector input_shape{batch_size, seq_len, 1}; + predictor->GetInput(0)->Resize(input_shape); + predictor->GetInput(1)->Resize(input_shape); + predictor->GetInput(2)->Resize(input_shape); + predictor->GetInput(3)->Resize(input_shape); + + memcpy(predictor->GetInput(0)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(1)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(2)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + memcpy(predictor->GetInput(3)->mutable_data(), + sample_input.raw_data(), + sizeof(int64_t) * batch_size * seq_len); + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + results.emplace_back(std::vector({0.108398})); + auto out = predictor->GetOutput(0); + ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape()[0], 1); + ASSERT_EQ(out->shape()[1], 1); + + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR( + out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_resnet50_lite_xpu.cc b/lite/tests/api/test_resnet50_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..be30369b9e187dd5d82527cb87eed405bc463ae3 --- /dev/null +++ b/lite/tests/api/test_resnet50_lite_xpu.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +TEST(Resnet50, test_resnet50_lite_xpu) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + std::vector input_shape{1, 3, 224, 224}; + input_tensor->Resize(input_shape); + auto* data = input_tensor->mutable_data(); + int input_num = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + for (int i = 0; i < input_num; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + results.emplace_back(std::vector( + {0.000268651, 0.000174053, 0.000213181, 0.000396771, 0.000591516, + 0.00018169, 0.000289721, 0.000855934, 0.000732185, 9.2055e-05, + 0.000220664, 0.00235289, 0.00571265, 0.00357688, 0.00129667, + 0.000465392, 0.000143775, 0.000211628, 0.000617144, 0.000265033})); + auto out = predictor->GetOutput(0); + ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape()[0], 1); + ASSERT_EQ(out->shape()[1], 1000); + + int step = 50; + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], + results[i][j], + 1e-5); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt index 697c9874ef2072eedf6b654863e25e981fb6834a..1ab73792e7fa3a46fd4c4b4479e4f231d55608f6 100644 --- a/lite/tests/cv/CMakeLists.txt +++ b/lite/tests/cv/CMakeLists.txt @@ -1,3 +1,3 @@ -if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) +if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM) lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) endif() diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 
e108e35af76c6b5f2c5719b650b06d849a2f3887..cb454c4da5bc15d65e480f55dabe01124bf18ca5 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) @@ -61,6 +61,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index 5a0b033b1b8c4d8f28aa05c3f2fcac40f2569bf4..c71eac8d4532eefd5569421807c85128746c6c8b 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -36,7 +36,9 @@ enum activation_type_test { FLOOR, RSQRT, GELU, - SQUARE + SQUARE, + HARD_SWISH, + RECIPROCAL }; class ActivationComputeTester : public arena::TestCase { @@ -49,6 +51,9 @@ class ActivationComputeTester : public arena::TestCase { float relu_clipped_coef_ = 6.; std::string prelu_mode_ = ""; float swish_beta_ = 0.; + float hard_swish_threshold = 6.0; + float hard_swish_scale = 6.0; + float hard_swish_offset = 3.0; DDim dims_{{1}}; std::string type_ = ""; activation_type_test act_type_ = RELU; @@ -199,6 +204,20 @@ class ActivationComputeTester : public arena::TestCase { } break; } + case HARD_SWISH: { + for (int i = 0; i < dims_.production(); i++) { + float max_value = std::max(0.f, x_data[i] + hard_swish_offset); + float min_value = std::min(max_value, hard_swish_threshold); + output_data[i] = min_value * x_data[i] / hard_swish_scale; + } + break; + } + case RECIPROCAL: { + for (int i = 0; i < dims_.production(); i++) { + output_data[i] = 1.0 / x_data[i]; + } + break; + } default: LOG(INFO) << "the type of activation is unknow."; } @@ -221,6 +240,11 @@ class ActivationComputeTester : public arena::TestCase { if (act_type_ == SWISH) { op_desc->SetAttr("beta", swish_beta_); } + if (act_type_ == HARD_SWISH) { + 
op_desc->SetAttr("threshold", hard_swish_threshold); + op_desc->SetAttr("scale", hard_swish_scale); + op_desc->SetAttr("offset", hard_swish_offset); + } } void PrepareData() override { @@ -552,5 +576,61 @@ TEST(Activation_gelu, precision) { } } +TEST(activation_hard_swish, precision) { + LOG(INFO) << "test hard_swish op"; + Place place; + float abs_error = 2e-5; + +#if defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; +#endif + + for (auto dims : std::vector>{ + {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) { + std::unique_ptr tester( + new ActivationComputeTester(place, + "def", + 0.01, + 6., + "all", + 0., + DDim(dims), + "hard_swish", + HARD_SWISH)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + +TEST(activation_reciprocal, precision) { + LOG(INFO) << "test reciprocal op"; + Place place; + float abs_error = 2e-5; + +#if defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; +#endif + + for (auto dims : std::vector>{ + {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) { + std::unique_ptr tester( + new ActivationComputeTester(place, + "def", + 0.01, + 6., + "all", + 0., + DDim(dims), + "reciprocal", + RECIPROCAL)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/tests/kernels/ctc_align_compute_test.cc b/lite/tests/kernels/ctc_align_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e32012549cab42858938388857c65e14f65be099 --- /dev/null +++ b/lite/tests/kernels/ctc_align_compute_test.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class CtcAlignComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string input_ = "input"; + std::string input_length_ = "input_length"; + std::string output_ = "output"; + std::string output_length_ = "output_length"; + std::vector input_data_; + std::vector input_shape_; + std::vector> input_lod_; + std::vector input_length_data_; + std::vector input_length_shape_; + std::vector output_data_; + std::vector output_shape_; + std::vector> output_lod_; + std::vector output_length_data_; + std::vector output_length_shape_; + int blank_; + bool merge_repeated_; + int padding_value_; + + public: + CtcAlignComputeTester(const Place& place, + const std::string& alias, + const std::vector& input_data, + const std::vector input_shape, + const std::vector>& input_lod, + const std::vector& input_length_data, + const std::vector input_length_shape, + const int blank, + const bool merge_repeated, + const int padding_value, + const std::vector& output_data, + const std::vector& output_shape, + const std::vector>& output_lod, + const std::vector& output_length_data, + const std::vector& output_length_shape) + : TestCase(place, alias) { + input_data_ = input_data; + input_shape_ = input_shape; + input_lod_ = input_lod; + input_length_data_ = input_length_data; + input_length_shape_ = input_length_shape; + blank_ = blank; + merge_repeated_ = merge_repeated; + padding_value_ = padding_value; + output_data_ = output_data; + output_shape_ = output_shape; + output_lod_ = output_lod; + output_length_data_ = output_length_data; + output_length_shape_ = output_length_shape; + } + + void RunBaseline(Scope* scope) override { + auto* output_tensor = scope->NewTensor(output_); + output_tensor->Resize(output_shape_); + if (!output_lod_.empty()) { + output_tensor->set_lod(output_lod_); + } + auto* output_data = output_tensor->mutable_data(); + int64_t output_num = 1; + for (auto e : output_shape_) { + output_num *= e; + } + for (int i = 0; i < output_num; i++) { + output_data[i] = output_data_[i]; + } + + if (!input_length_data_.empty() && !output_length_data_.empty()) { + auto* output_length_tensor = scope->NewTensor(output_length_); + output_length_tensor->Resize(output_length_shape_); + auto* output_length_data = output_length_tensor->mutable_data(); + int64_t num = 1; + for (auto e : output_length_shape_) { + num *= e; + } + for (int i = 0; i < num; i++) { + output_length_data[i] = output_length_data_[i]; + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("ctc_align"); + op_desc->SetInput("Input", {input_}); + op_desc->SetOutput("Output", {output_}); + if (!input_length_data_.empty()) { + op_desc->SetInput("InputLength", {input_length_}); + op_desc->SetOutput("OutputLength", {output_length_}); + } + op_desc->SetAttr("blank", blank_); + op_desc->SetAttr("merge_repeated", merge_repeated_); + op_desc->SetAttr("padding_value", padding_value_); + } + + void PrepareData() override { + SetCommonTensor(input_, DDim(input_shape_), input_data_.data(), input_lod_); + if (!input_length_data_.empty()) { + SetCommonTensor( + input_length_, DDim(input_length_shape_), input_length_data_.data()); + } + } +}; +TEST(CtcAlign1, precision) { + LOG(INFO) << "test ctc_align op"; +#ifdef LITE_WITH_ARM + // Define variable + const std::vector& input_data = { + 0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0}; + const std::vector input_shape = {18, 1}; + const std::vector> input_lod = {{11, 7}}; + const std::vector input_length_data = {}; + const std::vector input_length_shape = {}; + const int blank = 0; + const bool merge_repeated = false; + const int padding_value 
= 0; + const std::vector output_data = {1, 2, 2, 4, 4, 5, 6, 6, 7, 7, 7}; + const std::vector output_shape = {11, 1}; + const std::vector> output_lod = {{7, 4}}; + const std::vector output_length_data = {}; + const std::vector output_length_shape = {}; + + // Test + Place place(TARGET(kHost), PRECISION(kInt32)); + std::unique_ptr tester( + new CtcAlignComputeTester(place, + "def", + input_data, + input_shape, + input_lod, + input_length_data, + input_length_shape, + blank, + merge_repeated, + padding_value, + output_data, + output_shape, + output_lod, + output_length_data, + output_length_shape)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); +#endif +} + +TEST(CtcAlign2, precision) { + LOG(INFO) << "test ctc_align op"; +#ifdef LITE_WITH_ARM + // Define variable + const std::vector& input_data = { + 0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0}; + const std::vector input_shape = {3, 6}; + const std::vector> input_lod = {}; + const std::vector input_length_data = {6, 5, 4}; + const std::vector input_length_shape = {3, 1}; + const int blank = 0; + const bool merge_repeated = true; + const int padding_value = 0; + const std::vector output_data = { + 1, 2, 4, 0, 0, 0, 4, 5, 6, 0, 0, 0, 7, 0, 0, 0, 0, 0}; + const std::vector output_shape = {3, 6}; + const std::vector> output_lod = {}; + const std::vector output_length_data = {3, 3, 1}; + const std::vector output_length_shape = {3, 1}; + + // Test + Place place(TARGET(kHost), PRECISION(kInt32)); + std::unique_ptr tester( + new CtcAlignComputeTester(place, + "def", + input_data, + input_shape, + input_lod, + input_length_data, + input_length_shape, + blank, + merge_repeated, + padding_value, + output_data, + output_shape, + output_lod, + output_length_data, + output_length_shape)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); +#endif +} + +TEST(CtcAlign3, precision) { + LOG(INFO) << "test ctc_align op"; +#ifdef LITE_WITH_ARM + // Define variable + const std::vector& input_data = { + 0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 0, 0, 7, 7, 7, 0, 0}; + const std::vector input_shape = {3, 6}; + const std::vector> input_lod = {}; + const std::vector input_length_data = {6, 5, 4}; + const std::vector input_length_shape = {3, 1}; + const int blank = 0; + const bool merge_repeated = false; + const int padding_value = 0; + const std::vector output_data = { + 1, 2, 2, 4, 0, 0, 4, 5, 6, 0, 0, 0, 7, 7, 7, 0, 0, 0}; + const std::vector output_shape = {3, 6}; + const std::vector> output_lod = {}; + const std::vector output_length_data = {4, 3, 3}; + const std::vector output_length_shape = {3, 1}; + + // Test + Place place(TARGET(kHost), PRECISION(kInt32)); + std::unique_ptr tester( + new CtcAlignComputeTester(place, + "def", + input_data, + input_shape, + input_lod, + input_length_data, + input_length_shape, + blank, + merge_repeated, + padding_value, + output_data, + output_shape, + output_lod, + output_length_data, + output_length_shape)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); +#endif +} +} // namespace lite +} // namespace paddle diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt index 7dd4f522dbc0f10e8cfb7d19e95da4354ac4b779..e02307aa73cccdacd38bfd2bc9b4ca422a56d06c 100644 --- a/lite/tests/math/CMakeLists.txt +++ b/lite/tests/math/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND 
(LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tools/benchmark.sh b/lite/tools/benchmark.sh index 23bb183ec9711a43def5636f15a9b17795f0ec24..3af8176f97896d04b85195530f9b554fe4ddc5f7 100644 --- a/lite/tools/benchmark.sh +++ b/lite/tools/benchmark.sh @@ -2,12 +2,12 @@ set -e # Check input -if [ $# -lt 2 ]; +if [ $# -lt 3 ]; then echo "Input error" echo "Usage:" - echo " sh benchmark.sh benchmark_bin_path benchmark_models_path " - echo "\npower_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind." + echo " sh benchmark.sh " + echo " sh benchmark.sh " exit fi @@ -15,10 +15,8 @@ fi ANDROID_DIR=/data/local/tmp BENCHMARK_BIN=$1 MODELS_DIR=$2 +RESULT_FILENAME=$3 -RESULT_FILENAME=result.txt -INPUT_SHAPE=1,3,244,244 -POWER_MODE=3 WARMUP=10 REPEATS=30 IS_RUN_MODEL_OPTIMIZE=false @@ -27,25 +25,9 @@ NUM_THREADS_LIST=(1 2 4) MODELS_LIST=$(ls $MODELS_DIR) # Check input -if [ $# -gt 2 ]; -then - RESULT_FILENAME=$3 -fi if [ $# -gt 3 ]; then - INPUT_SHAPE=$4 -fi -if [ $# -gt 4 ]; -then - POWER_MODE=$5 -fi -if [ $# -gt 5 ]; -then - IS_RUN_MODEL_OPTIMIZE=$6 -fi -if [ $# -gt 6 ]; -then - IS_RUN_QUANTIZED_MODEL=$7 + IS_RUN_MODEL_OPTIMIZE=$4 fi # Adb push benchmark_bin, models @@ -54,26 +36,31 @@ adb shell chmod +x $ANDROID_DIR/benchmark_bin adb push $MODELS_DIR $ANDROID_DIR # Run benchmark -adb shell "echo 'PaddleLite Benchmark (in ms)\n' > $ANDROID_DIR/$RESULT_FILENAME" +adb shell "echo 'PaddleLite Benchmark' > $ANDROID_DIR/$RESULT_FILENAME" for threads in ${NUM_THREADS_LIST[@]}; do - adb shell "echo threads=$threads warmup=$WARMUP repeats=$REPEATS input_shape=$INPUT_SHAPE power_mode=$POWER_MODE >> $ANDROID_DIR/$RESULT_FILENAME" + adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME" for model_name in ${MODELS_LIST[@]}; do echo "Model=$model_name Threads=$threads" - adb shell "$ANDROID_DIR/benchmark_bin \ + if [ "$IS_RUN_MODEL_OPTIMIZE" = true ]; + then + adb shell "$ANDROID_DIR/benchmark_bin \ --model_dir=$ANDROID_DIR/${MODELS_DIR}/$model_name \ - --input_shape=$INPUT_SHAPE \ --warmup=$WARMUP \ --repeats=$REPEATS \ --threads=$threads \ - --power_mode=$POWER_MODE \ - --result_filename=$ANDROID_DIR/$RESULT_FILENAME \ - --run_model_optimize=$IS_RUN_MODEL_OPTIMIZE \ - --is_quantized_model=$IS_RUN_QUANTIZED_MODEL" + --result_filename=$ANDROID_DIR/$RESULT_FILENAME" + else + adb shell "$ANDROID_DIR/benchmark_bin \ + --optimized_model_path=$ANDROID_DIR/${MODELS_DIR}/$model_name \ + --warmup=$WARMUP \ + --repeats=$REPEATS \ + --threads=$threads \ + --result_filename=$ANDROID_DIR/$RESULT_FILENAME" + fi done adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME" done -adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME" -adb shell "echo power_mode refer: 0 for big cluster, 1 for little cluster, 2 for all cores, 3 for no bind >> $ANDROID_DIR/$RESULT_FILENAME" + # Adb pull benchmark result, show result adb pull $ANDROID_DIR/$RESULT_FILENAME . 
echo "\n--------------------------------------" diff --git a/lite/tools/build.sh b/lite/tools/build.sh index a888d8ef25cbe7c816693fa45d954672a8ad5b1f..e7394fcb6edbd7a2f4b564b7a0e7d5aa43506843 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -25,6 +25,7 @@ SHUTDOWN_LOG=ON BUILD_NPU=OFF NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ BUILD_XPU=OFF +BUILD_XTCL=OFF XPU_SDK_ROOT="$(pwd)/xpu_sdk_lib/" LITE_WITH_ARM_LANG=OFF @@ -138,6 +139,7 @@ function make_tiny_publish_so { -DLITE_WITH_NPU=$BUILD_NPU \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DLITE_WITH_XPU=$BUILD_XPU \ + -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -226,6 +228,7 @@ function make_full_publish_so { -DLITE_WITH_NPU=$BUILD_NPU \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DLITE_WITH_XPU=$BUILD_XPU \ + -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DLITE_WITH_TRAIN=$BUILD_TRAIN \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -260,6 +263,7 @@ function make_all_tests { -DLITE_WITH_NPU=$BUILD_NPU \ -DNPU_DDK_ROOT=$NPU_DDK_ROOT \ -DLITE_WITH_XPU=$BUILD_XPU \ + -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ -DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang} @@ -330,7 +334,10 @@ function make_cuda { -DWITH_TESTING=OFF \ -DLITE_WITH_ARM=OFF \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ - -DLITE_BUILD_EXTRA=ON + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_XPU=$BUILD_XPU \ + -DLITE_WITH_XTCL=$BUILD_XTCL \ + -DXPU_SDK_ROOT=$XPU_SDK_ROOT make publish_inference -j$NUM_PROC cd - @@ -362,9 +369,10 @@ function make_x86 { -DWITH_GPU=OFF \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DLITE_WITH_XPU=$BUID_XPU \ - -DXPU_SDK_ROOT=$XPU_SDK_ROOT + -DLITE_WITH_XPU=$BUILD_XPU \ + -DLITE_WITH_XTCL=$BUILD_XTCL \ + -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DCMAKE_BUILD_TYPE=Release make publish_inference -j$NUM_PROC cd - @@ -483,6 +491,10 @@ function main { BUILD_XPU="${i#*=}" shift ;; + --build_xtcl=*) + BUILD_XTCL="${i#*=}" + shift + ;; --xpu_sdk_root=*) XPU_SDK_ROOT="${i#*=}" shift diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh index 1912efda5edc6e436cc84dbdf9919a99e1ed3279..01d71aaf213abb99633112664af580b897ce7454 100755 --- a/lite/tools/build_mlu.sh +++ b/lite/tools/build_mlu.sh @@ -2,10 +2,10 @@ set -ex # global variables with default value -NEUWARE_HOME="${NEUWARE_HOME}" # XPU SDK +NEUWARE_HOME="${NEUWARE_HOME}" TARGET_NAME="all" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF -WITH_TESTING=OFF # ON/OFF +WITH_TESTING=ON # ON/OFF function print_usage { echo -e "\nUSAGE:" @@ -20,10 +20,9 @@ function print_usage { # readonly variables with default value readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ - -DWITH_PYTHON=OFF \ -DLITE_WITH_ARM=OFF" -readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} +readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-8} readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz readonly workspace=$(pwd) @@ -37,8 +36,7 @@ function prepare_thirdparty { fi tar xzf third-party-05b862.tar.gz else - # git submodule update --init --recursive - echo "third-party is in ready" + git submodule update --init --recursive fi } @@ -62,12 +60,12 @@ function prepare_workspace { } function build_mlu { + prepare_workspace build_dir=${workspace}/build.lite.mlu 
mkdir -p $build_dir cd $build_dir export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" - prepare_workspace cmake .. \ ${CMAKE_COMMON_OPTIONS} \ -DWITH_GPU=OFF \ @@ -75,9 +73,10 @@ function build_mlu { -DLITE_WITH_X86=ON \ -DWITH_MKL=ON \ -DLITE_WITH_MLU=ON \ + -DLITE_WITH_PYTHON=OFF \ -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ -DWITH_TESTING=${WITH_TESTING} \ - -DMLU_SDK_ROOT=${XPU_SDK_ROOT} + -DNEUWARE_HOME=${NEUWARE_HOME} make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh deleted file mode 100755 index fdf287501e8f4411f51e73c55b789753f2e85674..0000000000000000000000000000000000000000 --- a/lite/tools/build_xpu.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -set -ex - -# global variables with default value -XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK -TARGET_NAME="test_subgraph_pass" # default target -BUILD_EXTRA=ON # ON(with sequence ops)/OFF -WITH_TESTING=ON # ON/OFF - -function print_usage { - echo -e "\nUSAGE:" - echo - echo "----------------------------------------" - echo -e "--xpu_sdk_root=" - echo -e "--target_name=" - echo "----------------------------------------" - echo -} - -# readonly variables with default value -readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ - -DWITH_PYTHON=OFF \ - -DLITE_WITH_ARM=OFF" - -readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} - -readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz -readonly workspace=$(pwd) - -function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then - rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive - fi -} - -# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. -# here we fake an empty file to make cmake works. -function prepare_workspace { - # in build directory - # 1. Prepare gen_code file - GEN_CODE_PATH_PREFIX=lite/gen_code - mkdir -p ./${GEN_CODE_PATH_PREFIX} - touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc - - # 2.Prepare debug tool - DEBUG_TOOL_PATH_PREFIX=lite/tools/debug - mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} - cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ - - # clone submodule - # git submodule update --init --recursive - prepare_thirdparty -} - -function build_xpu { - build_dir=${workspace}/build.lite.xpu - mkdir -p $build_dir - cd $build_dir - - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" - prepare_workspace - cmake .. \ - ${CMAKE_COMMON_OPTIONS} \ - -DWITH_GPU=OFF \ - -DWITH_MKLDNN=OFF \ - -DLITE_WITH_X86=ON \ - -DWITH_MKL=ON \ - -DLITE_WITH_XPU=ON \ - -DLITE_BUILD_EXTRA=${BUILD_EXTRA} \ - -DWITH_TESTING=${WITH_TESTING} \ - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} - - make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE - - cd - - echo "Done" -} - -function main { - # Parse command line. 
- for i in "$@"; do - case $i in - --target_name=*) - TARGET_NAME="${i#*=}" - shift - ;; - --build_extra=*) - BUILD_EXTRA="${i#*=}" - shift - ;; - --xpu_sdk_root=*) - XPU_SDK_ROOT="${i#*=}" - shift - ;; - build) - build_xpu - shift - ;; - full_publish) - TARGET_NAME=publish_inference - build_xpu - shift - ;; - *) - # unknown option - print_usage - exit 1 - ;; - esac - done -} - -main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 703da69fa59f3aa99bad9fb04c0decb591486058..a5dc2b741d2d3d5fdd2f08d13b7dc483a3065b0e 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -192,6 +192,7 @@ function build_opencl { cmake_opencl ${os} ${abi} ${lang} make opencl_clhpp -j$NUM_CORES_FOR_COMPILE + make publish_inference -j$NUM_CORES_FOR_COMPILE build $TESTS_FILE } diff --git a/mobile/test/CMakeLists.txt b/mobile/test/CMakeLists.txt index 078440f45b0525ce49140ad78b2f9c23bb0f55f1..6dddeb47f6e33446d136a8d1301834aa17fceeb8 100644 --- a/mobile/test/CMakeLists.txt +++ b/mobile/test/CMakeLists.txt @@ -549,7 +549,7 @@ if (ENABLE_ALL_TEST) ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-net-performance paddle-mobile) - ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-inference-ercy net/test_inference_ercy.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-inference-api-v2 paddle-mobile) if (GPU_CL) @@ -566,6 +566,6 @@ else () ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) target_link_libraries(test-net-benchmark paddle-mobile) - ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-inference-ercy net/test_inference_ercy.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-inference-api-v2 paddle-mobile) endif () diff --git a/mobile/test/net/test_inference_api_v2.cpp b/mobile/test/net/test_inference_ercy.cpp similarity index 100% rename from mobile/test/net/test_inference_api_v2.cpp rename to mobile/test/net/test_inference_ercy.cpp diff --git a/mobile/test/net/test_inference_m2fm.cpp b/mobile/test/net/test_inference_m2fm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe03c99cda992b06c49e0165ad64d8289f165880 --- /dev/null +++ b/mobile/test/net/test_inference_m2fm.cpp @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include <iostream>
+#include "../test_helper.h"
+#include "io/paddle_inference_api.h"
+
+using namespace paddle_mobile;  // NOLINT
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kGPU_CL;
+  config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST;
+
+  config.prog_file = "../models/m2fm/model";
+  config.param_file = "../models/m2fm/params";
+  config.lod_mode = false;
+  config.load_when_predict = false;
+  return config;
+}
+
+int main() {
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+
+  // factor
+  int factor_len = 1 * 256 * 1 * 1;
+  std::vector<float> factor_v;
+  std::vector<int64_t> factor_dims{1, 256, 1, 1};
+  GetInput<float>(g_test_image_1x3x224x224, &factor_v, factor_dims);
+
+  PaddleTensor factor;
+  factor.shape = std::vector<int>({1, 256, 1, 1});
+  factor.data = PaddleBuf(factor_v.data(), factor_len * sizeof(float));
+  factor.dtype = PaddleDType::FLOAT32;
+  factor.layout = LayoutType::LAYOUT_CHW;
+
+  // remap
+  int remap_len = 1 * 256 * 256 * 2;
+  std::vector<float> remap_v;
+  std::vector<int64_t> remap_dims{1, 256, 256, 2};
+  GetInput<float>(g_test_image_1x3x224x224, &remap_v, remap_dims);
+
+  PaddleTensor remap;
+  remap.shape = std::vector<int>({1, 256, 256, 2});
+  remap.data = PaddleBuf(remap_v.data(), remap_len * sizeof(float));
+  remap.dtype = PaddleDType::FLOAT32;
+  remap.layout = LayoutType::LAYOUT_CHW;
+
+  // image
+  int image_len = 1 * 3 * 256 * 256;
+  std::vector<float> image_v;
+  std::vector<int64_t> image_dims{1, 3, 256, 256};
+  GetInput<float>(g_test_image_1x3x224x224, &image_v, image_dims);
+
+  PaddleTensor image;
+  image.shape = std::vector<int>({1, 3, 256, 256});
+  image.data = PaddleBuf(image_v.data(), image_len * sizeof(float));
+  image.dtype = PaddleDType::FLOAT32;
+  image.layout = LayoutType::LAYOUT_CHW;
+
+  PaddleTensor output0;
+  output0.shape = std::vector<int>({});
+  output0.data = PaddleBuf();
+  output0.dtype = PaddleDType::FLOAT32;
+  output0.layout = LayoutType::LAYOUT_CHW;
+
+  PaddleTensor output1;
+  output1.shape = std::vector<int>({});
+  output1.data = PaddleBuf();
+  output1.dtype = PaddleDType::FLOAT32;
+  output1.layout = LayoutType::LAYOUT_CHW;
+
+  PaddleTensor output2;
+  output2.shape = std::vector<int>({});
+  output2.data = PaddleBuf();
+  output2.dtype = PaddleDType::FLOAT32;
+  output2.layout = LayoutType::LAYOUT_CHW;
+
+  PaddleTensor output3;
+  output3.shape = std::vector<int>({});
+  output3.data = PaddleBuf();
+  output3.dtype = PaddleDType::FLOAT32;
+  output3.layout = LayoutType::LAYOUT_CHW;
+
+  predictor->Feed("x2paddle_mul_factor", factor);
+  predictor->Feed("x2paddle_base_remap", remap);
+  predictor->Feed("x2paddle_image", image);
+  predictor->Run();
+  predictor->Fetch("save_infer_model/scale_0", &output0);
+  predictor->Fetch("save_infer_model/scale_1", &output1);
+  predictor->Fetch("save_infer_model/scale_2", &output2);
+  predictor->Fetch("save_infer_model/scale_3", &output3);
+
+  float* out_ptr0 = reinterpret_cast<float*>(output0.data.data());
+  float* out_ptr1 = reinterpret_cast<float*>(output1.data.data());
+  std::cout << " print output0 : " << std::endl;
+  int numel = output0.data.length() / sizeof(float);
+  int stride = numel / 20;
+  stride = stride > 0 ? stride : 1;
+  for (size_t j = 0; j < numel; j += stride) {
+    std::cout << out_ptr0[j] << " ";
+  }
+  std::cout << std::endl;
+
+  std::cout << " print output1 : " << std::endl;
+  numel = output1.data.length() / sizeof(float);
+  stride = numel / 20;
+  stride = stride > 0 ? stride : 1;
+  for (size_t j = 0; j < numel; j += stride) {
+    std::cout << out_ptr1[j] << " ";
+  }
+  std::cout << std::endl;
+
+  return 0;
+}