diff --git a/.gitignore b/.gitignore index dc0a38edcb563589ce3845803174598ca68ec396..be97cf2f3ff9878774913ecf8dab0130179bbf16 100644 --- a/.gitignore +++ b/.gitignore @@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources +#flatbuffers +lite/model_parser/flatbuffers/framework_generated.h + build* + +# hiai libs +ai_ddk_lib* diff --git a/.gitmodules b/.gitmodules index 107036c70292cf33e945f45a8bac935dea554ece..37af6a724560144190539ab677c8f17524f5e645 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/protobuf-host"] path = third-party/protobuf-host url = https://github.com/protocolbuffers/protobuf.git +[submodule "third-party/flatbuffers"] + path = third-party/flatbuffers + url = https://github.com/google/flatbuffers.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac227f0154feb64178d9a99b6784bfd6db40d50..e598f1dcd501b2ca09273a0914ff4cdf66f8b0e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) +lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) @@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_WITH_LOG "Enable log printing or not." ON) +lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF) lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) @@ -106,7 +108,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) -lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF) +lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF) lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. 
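For reference, the options added above are ordinary `lite_option` switches and are toggled at configure time like the existing ones. A minimal sketch of configure invocations exercising them follows; the install path and the option combinations are illustrative assumptions, not taken from this patch:

```shell
# Sketch only: turn on the switches introduced by this patch at configure time.
# HUAWEI_ASCEND_NPU_DDK_ROOT must point at an installed Ascend DDK
# (consumed by cmake/device/huawei_ascend_npu.cmake below); the path is an example.
export HUAWEI_ASCEND_NPU_DDK_ROOT=/usr/local/Ascend
cmake .. -DWITH_LITE=ON -DLITE_WITH_X86=ON -DLITE_WITH_HUAWEI_ASCEND_NPU=ON

# Exception support and CUDA FP16 kernels are independent switches:
cmake .. -DWITH_LITE=ON -DLITE_WITH_EXCEPTION=ON
cmake .. -DWITH_LITE=ON -DLITE_WITH_CUDA=ON -DCUDA_WITH_FP16=ON
```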
@@ -168,6 +171,7 @@ if(LITE_WITH_RKNPU) include(device/rknpu) endif() +include(external/flatbuffers) # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -222,6 +226,11 @@ endif() if(LITE_WITH_MLU) include(mlu) endif() + +if(LITE_WITH_HUAWEI_ASCEND_NPU) + include(device/huawei_ascend_npu) +endif() + include(coveralls) include(external/mklml) # download mklml package diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 1b0890e0dbf5e741176c293a059d809752c72a43..773de573aff92599ad6e5fb746a2956d9e50a8c2 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,6 +174,10 @@ if (LITE_WITH_MLU) add_definitions("-DLITE_WITH_MLU") endif() +if (LITE_WITH_HUAWEI_ASCEND_NPU) +add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") endif() @@ -190,6 +194,10 @@ if (LITE_WITH_LOG) add_definitions("-DLITE_WITH_LOG") endif() +if (LITE_WITH_EXCEPTION) + add_definitions("-DLITE_WITH_EXCEPTION") +endif() + if (LITE_ON_TINY_PUBLISH) add_definitions("-DLITE_ON_TINY_PUBLISH") endif() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 4fc59ccd62671c5862a298832b1ec03d4e96d05a..68f91fe88173f1cd254bc44d5e7dbcd456bfcdb8 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -35,7 +35,11 @@ endif() if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "23") if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_API_LEVEL "22") + if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH) + set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24 + else() + set(ANDROID_API_LEVEL "22") + endif() endif() endif() @@ -76,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang") elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") set(triple arm-v7a-linux-android) set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE) + if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + # Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ is linked in the right order + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a") + if (ANDROID_API_LEVEL LESS 21) + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a") + endif() + if(ANDROID_STL_TYPE STREQUAL "c++_shared") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so") + elseif(ANDROID_STL_TYPE STREQUAL "c++_static") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a") + else() + message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.") + endif() + endif() else() message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") endif() diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 069923c779fbd3eed4f5f81ef3e386ff70fac215..c9c3fc9f2681b6002567d555a26ee14edefaeae5 100644 --- 
a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -23,6 +23,21 @@ if(ANDROID) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC") + + # Don't re-export libgcc symbols + set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + + # Only the libunwind.a from clang(with libc++) provide C++ exception handling support for 32-bit ARM + # Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding + if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + endif() endif() if(ARMLINUX) @@ -59,14 +74,13 @@ function(check_linker_flag) endfunction() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables") +endif() if (LITE_ON_TINY_PUBLISH) - if((NOT LITE_WITH_PYTHON)) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") - endif() - if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e7df3f0fd6f0b0efcaf9cd859df5fb84a0cadfc4..eb8e26218ad1d8adc920b1834abd9ba10669a3e9 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA) return() endif() +if(WITH_CUDA_FP16) + add_definitions("-DCUDA_WITH_FP16") +endif() + set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62") @@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() +if (CUDA_WITH_FP16) + STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs}) +endif() + include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) if(WIN32) diff --git a/cmake/device/huawei_ascend_npu.cmake b/cmake/device/huawei_ascend_npu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0bd9591eee702f4db914a8b547c4c99b21d0473b --- /dev/null +++ b/cmake/device/huawei_ascend_npu.cmake @@ -0,0 +1,169 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +# 1. path to the Huawei Ascend install directory +if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT) + set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT}) + if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT) + message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON") + endif() +endif() +message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}") + +# 2. Huawei Ascend include directory +set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include") +set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include") +set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp") +include_directories(${ACL_INCLUDE_DIR}) +include_directories(${ATC_INCLUDE_DIR}) +include_directories(${OPP_INCLUDE_DIR}) + +# 3. find ACL libs (ACL libs should be linked before ATC libs) +find_library(ACL_ASCENDCL_FILE NAMES ascendcl + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_ASCENDCL_FILE) + message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}") + add_library(acl_ascendcl SHARED IMPORTED GLOBAL) + set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE}) +endif() + +# 3.1 ascendcl dependency - libruntime.so +find_library(ACL_RUNTIME_FILE NAMES runtime + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_RUNTIME_FILE) + message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}") + add_library(acl_runtime SHARED IMPORTED GLOBAL) + set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE}) +endif() + +# 4.1 find ATC libs - libregister.so +find_library(ATC_REGISTER_FILE NAMES register + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_REGISTER_FILE) + message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}") + add_library(atc_register SHARED IMPORTED GLOBAL) + set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE}) +endif() + +# 4.1.1 dependency of register - libprotobuf.so.19 +find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19 + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_PROTOBUF_FILE) + message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}") + add_library(atc_protobuf SHARED IMPORTED GLOBAL) + set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE}) +endif() + +# 4.1.2 dependency of register - libgraph.so +find_library(ATC_GRAPH_FILE NAMES
graph + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GRAPH_FILE) + message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}") + add_library(atc_graph SHARED IMPORTED GLOBAL) + set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE}) +endif() + +# 4.2 find ATC libs - libge_compiler.so +find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMPILER_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}") + add_library(atc_ge_compiler SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE}) +endif() + +# 4.2.1 dependencies of libge_compiler.so - libge_common.so +find_library(ATC_GE_COMMON_FILE NAMES ge_common + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMMON_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}") + add_library(atc_ge_common SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE}) +endif() + +# 4.2.3 dependencies of libge_compiler.so - libresource.so +find_library(ATC_RESOURCE_FILE NAMES resource + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_RESOURCE_FILE) + message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}") + add_library(atc_resource SHARED IMPORTED GLOBAL) + set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE}) +endif() + +# 4.3 find OPP libs - libopsproto.so +find_library(OPP_OPS_PROTO_FILE NAMES opsproto + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in + NO_DEFAULT_PATH) + +if(NOT OPP_OPS_PROTO_FILE) + message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in") +else() + message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}") + add_library(opp_ops_proto SHARED IMPORTED GLOBAL) + set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE}) +endif() + +# 4.3.1 dependency of opp_ops_proto - liberror_manager.so +find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_ERROR_MANAGER_FILE) + message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}") + add_library(atc_error_manager SHARED IMPORTED GLOBAL) + set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE}) +endif() + +# note: huawei_ascend_npu_runtime_libs should before huawei_ascend_npu_builder_libs +set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs") +set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager + atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs") \ No newline at end 
of file diff --git a/cmake/device/npu.cmake b/cmake/device/npu.cmake index 88598f4690a157b20ac1873d84ad13c2f8652725..0409b6a60fc651cbaade61998a09bc0489bc978c 100644 --- a/cmake/device/npu.cmake +++ b/cmake/device/npu.cmake @@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} NO_DEFAULT_PATH) +# Added in HiAI DDK 320 or later version +find_library(NPU_DDK_HCL_FILE NAMES hcl + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) + if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") else() @@ -78,5 +83,13 @@ else() set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) endif() -set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") +if(NOT NPU_DDK_HCL_FILE) +# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}") +else() + message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}") + add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL) + set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE}) +endif() + +set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 823048552f3cb5f05375e97e94cd5b5ad63e7563..16fc7dcf4191a6b2a145d4d6e70e915fe5321a6b 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -39,7 +39,7 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib + PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e6ab31ee855f5bbc0594f37c00a3ec46d8e4231d --- /dev/null +++ b/cmake/external/flatbuffers.cmake @@ -0,0 +1,114 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers) +SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers) +SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers) +SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." 
FORCE) +IF(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ELSE(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR}) + +if(NOT HOST_CXX_COMPILER) + set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(HOST_C_COMPILER ${CMAKE_C_COMPILER}) +endif() + +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}") + +ExternalProject_Add( + extern_flatbuffers + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/google/flatbuffers.git" + GIT_TAG "v1.12.0" + SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR} + PREFIX ${FLATBUFFERS_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DFLATBUFFERS_BUILD_TESTS=OFF + ${CROSS_COMPILE_CMAKE_ARGS} + ${OPTIONAL_ARGS} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib") + add_custom_command(TARGET extern_flatbuffers POST_BUILD + COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib + ) + ENDIF() +ENDIF(WIN32) +ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES}) +ADD_DEPENDENCIES(flatbuffers extern_flatbuffers) + +SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc) + +function(register_generated_output file_name) + get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS) + list(APPEND tmp ${file_name}) + set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp}) +endfunction(register_generated_output) + +function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT) + if(FLATBUFFERS_BUILD_LEGACY) + set(OPT ${OPT};--cpp-std c++0x) + else() + # --cpp-std is defined by flatc default settings. 
+ endif() + message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'") + get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH) + message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}") + string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}" + COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}" + --cpp --gen-mutable --gen-object-api --reflect-names + ${OPT} + -o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}" + DEPENDS flatbuffers + COMMENT "Run generation: '${GEN_HEADER}'") + register_generated_output(${GEN_HEADER}) + add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER}) +endfunction() + +set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers") +set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs") +compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty") +include_directories(${FLATBUFFERS_INCLUDE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}) + diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 8408a79fa4265b08771e435dcc5e82801a9d40f9..fe66d0f643e9bdf0cb778c4e4647294f553c023e 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -118,6 +118,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -165,6 +171,7 @@ function(lite_cc_library TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -193,7 +200,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" 
${ARGN}) @@ -215,6 +222,7 @@ function(lite_cc_binary TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) if(NOT WIN32) @@ -246,7 +254,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -276,6 +284,7 @@ function(lite_cc_test TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") +set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") @@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) +# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -438,6 +448,15 @@ function(add_kernel TARGET device level) endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU") + if (NOT LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) foreach(src ${args_SRCS}) @@ -481,6 +500,7 @@ function(add_kernel TARGET device level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -499,7 +519,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS 
MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -537,6 +557,7 @@ function(add_operator TARGET level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/demo/cxx/train_demo/README.md b/docs/demo_guides/cpp_train_demo.md similarity index 82% rename from lite/demo/cxx/train_demo/README.md rename to docs/demo_guides/cpp_train_demo.md index 56f4513d45676a1deb51bfb93096db156ddd0449..c10f2091f9c14f6fc81563248c75e72abd713666 100644 --- a/lite/demo/cxx/train_demo/README.md +++ b/docs/demo_guides/cpp_train_demo.md @@ -1,8 +1,10 @@ +# C++ Train Demo -# Introduction - 我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 +## Introduction + +我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 - 你可以通过book库中的 +你可以通过book库中的 [文档](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) 和 [源码](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) @@ -10,18 +12,16 @@ 其使用线性回归(Linear Regression) 模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。 -注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。 - -# Requirements +## Requirements - 一部安卓手机,用于运行训练程序 -- 装了Paddle (version: 1.7.0) 的python +- 装了Paddle (version >= 1.7.0) 的python -# Quick start +## Quick start -## Step1 build paddle-lite +### Step1 build paddle-lite -请按照[paddle-lite官方文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) 的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: +请按照paddle-lite官方文档的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: ```shell ## 配置环境 @@ -51,7 +51,7 @@ cd Paddle-Lite Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so ``` -## Step2 编译lr_trainer +### Step2 编译lr_trainer ```shell cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ @@ -64,7 +64,7 @@ bin/ `-- demo_trainer ``` -## Step3 download model and run it! +### Step3 download model and run it! 
在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行: @@ -102,7 +102,7 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -# 更多细节 +## 更多细节 上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令: ```shell @@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d 如果你想生成自己的模型用于训练,可以参考`train.py`中保存模型的方式。 -# 与Paddle训练结果做校对 +## 与Paddle训练结果做校对 -## 前10个Loss值 +### 前10个Loss值 为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。 @@ -171,11 +171,11 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -## Loss 曲线 +### Loss 曲线 控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。 -![lr_loss](image/lr_loss.png) +![lr_loss](../images/lr_loss.png) 如果想复现上述效果,paddle+python的运行命令为: diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md index 31a0e411566297d5556e6b7fffcec1343cd83781..52ea158cf7b9c827c17225b6690b1bd9d8d15d24 100644 --- a/docs/demo_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h rm ./lite/api/paddle_use_ops.h # 设置编译参数并开始编译 +# android-armv7:cpu+gpu+cv+extra ./lite/tools/build_android.sh \ --arch=armv7 \ --toolchain=clang \ - --with_cv=OFF \ --with_log=OFF \ - --with_extra=OFF \ + --with_extra=ON \ + --with_cv=ON \ --with_opencl=ON +# android-armv8:cpu+gpu+cv+extra +./lite/tools/build_android.sh \ + --arch=armv8 \ + --toolchain=clang \ + --with_log=OFF \ + --with_extra=ON \ + --with_cv=ON \ + --with_opencl=ON + + # 注:编译帮助请执行: ./lite/tools/build_android.sh help ``` @@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \ ## 3. 如何在Code中使用 -即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); +即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc),其中也包括判断当前设备是否支持OpenCL的方法; 注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 diff --git a/docs/demo_guides/python_demo.md b/docs/demo_guides/python_demo.md index d6a7b15bd9be638ef586e6b589e35eecbf1613c2..59f81783c0b2e791f9623e84cf57c269cbb7d6f2 100644 --- a/docs/demo_guides/python_demo.md +++ b/docs/demo_guides/python_demo.md @@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb) predictor = create_paddle_predictor(config) ``` -(3) 设置输入数据 +(3) 从图片读入数据 + +```python +image = Image.open('./example.jpg') +resized_image = image.resize((224, 224), Image.BILINEAR) +image_data = np.array(resized_image).flatten().tolist() +``` + +(4) 设置输入数据 + ```python input_tensor = predictor.get_input(0) input_tensor.resize([1, 3, 224, 224]) -input_tensor.set_float_data([1.] * 3 * 224 * 224) +input_tensor.set_float_data(image_data) ``` -(4) 执行预测 +(5) 执行预测 ```python predictor.run() ``` -(5) 得到输出数据 +(6) 得到输出数据 ```python output_tensor = predictor.get_output(0) print(output_tensor.shape()) diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/docs/images/lr_loss.png similarity index 100% rename from lite/demo/cxx/train_demo/image/lr_loss.png rename to docs/images/lr_loss.png diff --git a/docs/index.rst b/docs/index.rst index c241f091ed2cae906879f98b769bc6b7ce830fe1..b2fba7daba51c68207af27e249559c18ab10235f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation! demo_guides/baidu_xpu demo_guides/rockchip_npu demo_guides/mediatek_apu - + +.. 
toctree:: + :maxdepth: 1 + :caption: 训练示例(预览) + :name: sec-train_demo_guides + + demo_guides/cpp_train_demo + .. toctree:: :maxdepth: 1 :caption: API文档 diff --git a/docs/user_guides/Compile/iOS.md b/docs/user_guides/Compile/iOS.md index 355cc11875ce8f8db891fb843d2f1624180b71ff..60375ad1085dfac090442f9c0dad86cf71b64c9e 100644 --- a/docs/user_guides/Compile/iOS.md +++ b/docs/user_guides/Compile/iOS.md @@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件 - 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): ```shell -./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir ``` ```shell --with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fed728cb0e06c9758a0497a9cbb93d7edf39bda7..4c80d638d224d294e247ad3f5300498dd536be62 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -21,11 +21,11 @@ pip install paddlelite - 方法二: 下载opt可执行文件 从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择当前预测库对应版本的`opt`转化工具 -本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载 +本文提供`release/v2.6.1`和`release/v2.2.0`版本的优化工具下载 |版本 | Linux | MacOS| |---|---|---| -| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) | |`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | - 方法三: 源码编译opt diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 338449bfcb92e4029763c4357eb6d1fd5b820272..ee156038a6ea144921258734c92e9a2ea757d6ec 100644 --- a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -49,4 +49,4 @@ $ ./opt \ ## 五. 
测试工具 -为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug) 和 [Profile工具](debug)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ff4d00dbb1051320f817c8220a11a77edde7fb05..10601e34f9815bfee88d8dba58988169839cc86d 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") +message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -45,14 +46,17 @@ if (WITH_TESTING) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz") endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz") endif() endif() @@ -242,7 +246,6 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} - COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full" diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 85744f5cac4b5b6dc6cb149a0375a69c98d55dd7..6ff381268a5796a52136214b64db39c057b5d59b 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG)) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +endif() if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") @@ -11,12 +11,13 @@ endif() set(light_lib_DEPS 
light_api paddle_api paddle_api_light) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc DEPS paddle_api paddle_api_light paddle_api_full) - add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) - target_link_libraries(paddle_full_api_shared framework_proto) + target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files}) + add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header) + target_link_libraries(paddle_full_api_shared framework_proto op_registry) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) @@ -39,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH NPU_DEPS ${npu_kernels} APU_DEPS ${apu_kernels} RKNPU_DEPS ${rknpu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if(WIN32) target_link_libraries(paddle_light_api_shared shlwapi.lib) endif() - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels}) if(APPLE) set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds") set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}") @@ -70,7 +72,7 @@ else() set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") endif() set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -93,6 +95,7 @@ if (WITH_TESTING) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} APU_DEPS ${apu_kernels}) endif() @@ -111,6 +114,10 @@ if(LITE_WITH_RKNPU) set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) endif() +if(LITE_WITH_HUAWEI_ASCEND_NPU) + set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps}) +endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") @@ -125,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") +message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -143,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + HUAWEI_ASCEND_NPU_DEPS 
${huawei_ascend_npu_kernels}) endif() # for light api @@ -167,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - MLU_DEPS ${mlu_kernels}) + MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -190,6 +200,7 @@ if(WITH_TESTING) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -321,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -361,6 +373,9 @@ endif() if (LITE_WITH_PYTHON) add_subdirectory(python) + # add library for opt_base + lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) + add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) endif() if (LITE_ON_TINY_PUBLISH) @@ -368,9 +383,6 @@ if (LITE_ON_TINY_PUBLISH) endif() -# add library for opt_base -lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) -add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") @@ -393,6 +405,7 @@ if(NOT WITH_COVERAGE) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -414,7 +427,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -429,7 +443,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -444,7 +459,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -458,7 +474,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS 
paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -469,8 +486,9 @@ if(NOT IOS) XPU_DEPS ${xpu_kernels} RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} CL_DEPS ${opencl_kernels} - BM_DEPS ${bm_kernels} + BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -486,7 +504,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index d46e9f7cdec1cf422340ff11165ee166c7520bab..2929e24117c616a99ff4e078fd77fe8827186cb1 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + add_dependencies(paddle_lite_jni framework_fbs_header) if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -31,7 +32,7 @@ else() endif() set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f0557226a8770201d0fe79c385ef7e2d0240e91c..52fc33830828ce1325a77b821f1cea4c329e933b 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -13,26 +13,31 @@ // limitations under the License. 
#include "lite/api/cxx_api.h" + #include #include #include #include #include #include + #include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { namespace lite { +std::vector GetAllOps() { + return OpLiteFactory::Global().GetAllOps(); +} + void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { if (!program_) { GenRuntimeProgram(); } - program_->SaveOpInfosToProgram(program_desc_.get()); - program_->UpdateVarsOfProgram(program_desc_.get()); + program_->SaveToProgram(program_desc_); switch (model_type) { case lite_api::LiteModelType::kProtobuf: SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true); @@ -52,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir, void Predictor::SaveOpKernelInfo(const std::string &model_dir) { std::set ops_info; std::set kernels_info; - const auto &instructions_ = program_->instructions(); - for (auto &node : instructions_) { - // parse op type infomation - auto op = node.op()->op_info(); - ops_info.insert(op->Type()); - // parse kernel type information - std::string kernel_type_str = - node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) + - "," + PrecisionRepr(node.kernel()->precision()) + "," + - DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias(); - kernels_info.insert(kernel_type_str); + auto block_size = program_->block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + const auto &insts = program_->instructions(block_idx); + for (auto &inst : insts) { + // parse op type infomation + auto op = inst.op()->op_info(); + ops_info.insert(op->Type()); + // parse kernel type information + std::string kernel_type_str = + inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) + + "," + PrecisionRepr(inst.kernel()->precision()) + "," + + DataLayoutRepr(inst.kernel()->layout()) + "," + + inst.kernel()->alias(); + kernels_info.insert(kernel_type_str); + } } // get souce_file name from op type and kernel type @@ -164,9 +173,9 @@ void Predictor::PrepareFeedFetch() { std::vector feeds; std::vector fetchs; - const auto &insts = program_->instructions(); - for (size_t i = 0; i < program_->num_instructions(); i++) { - const auto &op = insts[i].op()->op_info(); + const auto &insts = program_->instructions(kRootBlockIdx); + for (auto &inst : insts) { + const auto &op = inst.op()->op_info(); if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { @@ -249,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config, } else { LOG(INFO) << "Load model from file."; } - Build(model_path, model_file, param_file, @@ -290,10 +298,10 @@ void Predictor::Build(const std::string &model_path, Build(program_desc_, valid_places, passes); } -void Predictor::Build(const std::shared_ptr &desc, +void Predictor::Build(const std::shared_ptr &program_desc, const std::vector &valid_places, const std::vector &passes) { - program_desc_ = desc; + program_desc_ = program_desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; for (auto &valid_place : valid_places) { @@ -326,13 +334,11 @@ void Predictor::Build(const std::shared_ptr &desc, } } if (is_quantized_model) { -#ifdef LITE_WITH_ARM inner_places.insert(inner_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); -#endif } - Program program(*desc.get(), scope_, inner_places); + Program program(program_desc_, scope_, inner_places); valid_places_ = inner_places; core::KernelPickFactor factor; diff 
--git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 6d0b7830d37be7e441df9e0e71f87572edaf3911..ceb823d5811aed26792318e3c1bf718ad9c2d851 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -36,6 +36,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ".tailored_kernels_source_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; +std::vector<std::string> GetAllOps(); + /* * Predictor for inference, input a model, it will optimize and execute it. */ @@ -47,18 +49,33 @@ class LITE_API Predictor { program_desc_ = std::make_shared<cpp::ProgramDesc>(); } - // Create a predictor with the weight variable scope set. + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. Create a predictor with the + // given weight variable scope. + /////////////////////////////////////////////////////////////////// explicit Predictor(const std::shared_ptr<Scope>& root_scope) : scope_(root_scope) {} - Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc, + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. This constructor can + // only be called in Predictor->Clone. It creates + // a predictor from an existing ProgramDesc, Scope and RuntimeProgram. + /////////////////////////////////////////////////////////////////// + Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc, const std::shared_ptr<Scope>& root, const std::vector<Place>& valid_places, const std::vector<std::string>& var_names = {}) - : program_desc_(desc), scope_(root) { - Program program(*desc.get(), scope_, valid_places, var_names); - optimizer_ = Optimizer(std::move(program), valid_places); - exec_scope_ = optimizer_.exec_scope(); + : program_desc_(program_desc), scope_(root) { + // step 1. Create a Program to construct the exec_scope and ops + Program program(program_desc_, scope_, valid_places, var_names); + exec_scope_ = program.exec_scope(); valid_places_ = valid_places; + + // step 2. Create the RuntimeProgram. + program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx)); + program_generated_ = true; } // Build from a model, with places set for hardware config. @@ -77,32 +94,62 @@ class LITE_API Predictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool memory_from_memory = false); - void Build(const std::shared_ptr<cpp::ProgramDesc>& desc, + void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc, const std::vector<Place>& valid_places, const std::vector<std::string>& passes = {}); - std::shared_ptr<Predictor> Clone() const { + ////////////////////////////////////////////////////////// + // Function: Clone + // Usage: Create a Predictor from an existing one; + // the cloned predictor will share persistable variables + // in scope_ with the original predictor. + ////////////////////////////////////////////////////////// + std::shared_ptr<Predictor> Clone() { + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor from current program_desc_ and + // runtime_program. auto predictor = std::make_shared<Predictor>(program_desc_, scope_, valid_places_); + // step 3.
Return the result return predictor; } - - std::shared_ptr Clone( - const std::vector& var_names) const { + ////////////////////////////////////////////////////////// + // Function: Clone(var_names) + // Usage: Create a Predictor from an existed one, + // the cloned predictor will share persistable variables + // but persistable variables of name var_names will not + // be shared. + ////////////////////////////////////////////////////////// + std::shared_ptr Clone(const std::vector& var_names) { CHECK(program_desc_) << "Both program and scope of current predicotr " "should be not be nullptr in Clone mode."; CHECK(scope_) << "Both program and scope of current predicotr should be " "not be nullptr in Clone mode."; + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor friom current program_desc_ and + // runtime_program. auto predictor = std::make_shared( program_desc_, scope_, valid_places_, var_names); - - for (auto i : var_names) { - predictor->exec_scope_->LocalVar(i); - auto* tensor = predictor->scope_->Var(i)->GetMutable(); + // step3. Copy some persistable variables into private scope. + for (auto var_name : var_names) { + predictor->exec_scope_->LocalVar(var_name); + auto* tensor = + predictor->scope_->Var(var_name)->GetMutable(); auto* sub_tensor = - predictor->exec_scope_->Var(i)->GetMutable(); + predictor->exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } + // step4. Return the result return predictor; } @@ -138,6 +185,7 @@ class LITE_API Predictor { // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; + Scope* scope() { return scope_.get(); } // This method is disabled in mobile, for unnecessary dependencies required. 
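Editor's note: both Clone overloads above keep the root scope shared between the original predictor and the clone, so weights are stored once; only the variables listed in var_names get a private copy that can later diverge. A toy sketch of that ownership model, using hypothetical ToyScope/ToyPredictor types rather than the real Scope/Predictor classes:

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct ToyScope {
  std::map<std::string, float> vars;  // stands in for persistable tensors (weights)
};

struct ToyPredictor {
  std::shared_ptr<ToyScope> root;     // shared with every clone
  std::map<std::string, float> priv;  // per-predictor private copies

  ToyPredictor Clone(const std::vector<std::string>& var_names) const {
    ToyPredictor cloned{root, {}};
    for (const auto& name : var_names) {
      cloned.priv[name] = root->vars.at(name);  // copied, so it can diverge later
    }
    return cloned;
  }
};

int main() {
  ToyPredictor p{std::make_shared<ToyScope>(), {}};
  p.root->vars["fc_w"] = 1.f;
  p.root->vars["conv_w"] = 2.f;
  ToyPredictor q = p.Clone({"fc_w"});
  std::cout << q.priv.size() << " private, " << q.root->vars.size() << " shared\n";  // 1 private, 2 shared
}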
void SaveModel( @@ -160,7 +208,7 @@ class LITE_API Predictor { std::shared_ptr program_desc_; std::shared_ptr scope_; Scope* exec_scope_; - std::unique_ptr program_; + std::shared_ptr program_; bool program_generated_{false}; std::vector input_names_; std::vector output_names_; diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 7b3b6bf043dae6008d8d6d9bc1acde97a2e3de38..726783349f0dcc049c4578df5c9e0ecbdb3dee4f 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -53,12 +53,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { #endif #ifdef LITE_WITH_MLU Env::Init(); - lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), - config.mlu_core_number(), - config.mlu_use_first_conv(), - config.mlu_first_conv_mean(), - config.mlu_first_conv_std(), - config.mlu_input_layout()); + lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_input_layout(), + config.mlu_firstconv_param()); #endif // LITE_WITH_MLU auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); @@ -75,6 +73,18 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { } mode_ = config.power_mode(); threads_ = config.threads(); +#ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration + Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( + config.subgraph_model_cache_dir()); +#endif #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) int num_threads = config.x86_math_library_num_threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 5f57ed40ddb762f2d80fce2327a01100bae741d9..fbcf171726d741ef0073f423bc4a600c9f9389d0 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -15,8 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT namespace paddle { namespace lite { @@ -24,17 +22,18 @@ namespace lite { void LightPredictor::Build(const std::string& lite_model_file, bool model_from_memory) { if (model_from_memory) { - LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromMemory( + lite_model_file, scope_.get(), program_desc_.get()); } else { - LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get()); } // For weight quantization of post training, load the int8/16 weights // for optimized model, and dequant it to fp32. 
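Editor's note: a hedged usage sketch of the configuration options consumed in CxxPaddleApiImpl::Init above. set_device_id() and the subgraph model cache directory are introduced in this patch; set_model_dir(), set_subgraph_model_cache_dir() and CreatePaddlePredictor() are assumed to keep their usual published Paddle-Lite signatures, and "paddle_api.h" is assumed to be the installed API header name.

#include "paddle_api.h"  // published paddle::lite_api header (assumed install name)

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  // New in this patch: choose the Huawei Ascend NPU card and a directory in
  // which compiled subgraph models are cached between runs.
  config.set_device_id(0);
  config.set_subgraph_model_cache_dir("/data/subgraph_cache");
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
  return 0;
}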
DequantizeWeight(); - - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); + program_desc_.reset(); } void LightPredictor::Build(const std::string& model_dir, @@ -45,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir, switch (model_type) { #ifndef LITE_ON_TINY_PUBLISH case lite_api::LiteModelType::kProtobuf: - LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_); + LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get()); break; #endif case lite_api::LiteModelType::kNaiveBuffer: { if (model_from_memory) { LoadModelNaiveFromMemory( - model_buffer, param_buffer, scope_.get(), &cpp_program_desc_); + model_buffer, param_buffer, scope_.get(), program_desc_.get()); } else { - LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_); + LoadModelNaive(model_dir, scope_.get(), program_desc_.get()); } break; } @@ -62,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir, } DequantizeWeight(); - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); } @@ -111,15 +110,17 @@ std::vector LightPredictor::GetOutputNames() { } // append the names of inputs and outputs into input_names_ and output_names_ void LightPredictor::PrepareFeedFetch() { - auto current_block = cpp_program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; - for (size_t i = 0; i < current_block->OpsSize(); i++) { - auto op = current_block->GetOp(i); - if (op->Type() == "feed") { - feeds.push_back(op); - } else if (op->Type() == "fetch") { - fetchs.push_back(op); + std::vector feeds; + std::vector fetchs; + std::shared_ptr program_desc = program_desc_; + auto main_block = program_desc->GetBlock(kRootBlockIdx); + auto op_size = main_block->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto op_desc = main_block->GetOp(op_idx); + if (op_desc->Type() == "feed") { + feeds.push_back(op_desc); + } else if (op_desc->Type() == "fetch") { + fetchs.push_back(op_desc); } } input_names_.resize(feeds.size()); @@ -134,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() { } } -void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { - std::vector insts; - // 1. Create op first - Program program(prog, scope_, {}); - -// 2. Create Instructs -#ifdef LITE_WITH_OPENCL - using OpenCLContext = Context; - std::unique_ptr local_ctx(new KernelContext()); - local_ctx->As().InitOnce(); -#endif - - // Create the kernels of the target places, and filter out the specific - // kernel with the target alias. 
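Editor's note: the rewritten LightPredictor::PrepareFeedFetch above simply scans the ops of the root block and classifies the "feed" and "fetch" operators, which define the predictor's inputs and outputs. A standalone sketch of that scan with a toy OpDesc stand-in (the real cpp::OpDesc also carries the "col" attribute used to order the names):

#include <iostream>
#include <string>
#include <vector>

struct ToyOpDesc { std::string type; };

int main() {
  std::vector<ToyOpDesc> main_block = {{"feed"}, {"conv2d"}, {"softmax"}, {"fetch"}};
  std::vector<const ToyOpDesc*> feeds, fetchs;
  for (const auto& op : main_block) {
    if (op.type == "feed") feeds.push_back(&op);
    else if (op.type == "fetch") fetchs.push_back(&op);
  }
  std::cout << feeds.size() << " input(s), " << fetchs.size() << " output(s)\n";  // 1 input(s), 1 output(s)
}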
- for (auto& op : program.ops()) { - auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); - std::string op_type, alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - auto kernels = op->CreateKernels({place}); - // filter out a kernel - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - -#ifdef LITE_WITH_OPENCL - if ((*it)->target() == TARGET(kOpenCL)) { - std::unique_ptr ctx(new KernelContext()); - (*local_ctx).As().CopySharedTo(&ctx->As()); - (*it)->SetContext(std::move(ctx)); - } else { - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +void LightPredictor::BuildRuntimeProgram( + const std::shared_ptr& program_desc) { + auto* exe_scope = &scope_->NewScope(); + // Prepare workspace + scope_->Var("feed")->GetMutable>(); + scope_->Var("fetch")->GetMutable>(); + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto var_desc = block_desc->GetVar(var_idx); + if (!var_desc->Persistable()) { + exe_scope->Var(var_desc->Name()); + } else { + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue; + scope_->Var(var_desc->Name()); + } } -#else - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); -#endif - - insts.emplace_back(op, std::move(*it)); } - program_.reset(new RuntimeProgram(std::move(insts))); - - CHECK(program.exec_scope()); - program_->set_exec_scope(program.exec_scope()); + // Only extracting the ops and generate the runtime program from the main + // block desc + program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx)); } void LightPredictor::DequantizeWeight() { + std::shared_ptr program_desc = program_desc_; #define PROCESS_CONV2D_DATA() \ for (int64_t i = 0; i < ch; ++i) { \ for (int64_t j = 0; j < offset; ++j) { \ @@ -207,10 +189,9 @@ void LightPredictor::DequantizeWeight() { } return result; }; - Tensor tmp_tensor; - for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { - auto* block = cpp_program_desc_.GetBlock(i); + for (size_t i = 0; i < program_desc->BlocksSize(); i++) { + auto* block = program_desc->GetBlock(i); for (size_t k = 0; k < block->OpsSize(); ++k) { auto* op_desc = block->GetOp(k); if (is_weight_quantized_op(op_desc)) { diff --git a/lite/api/light_api.h b/lite/api/light_api.h index e651d1323a5ce6e36546e9437d06a472eb8a5137..97a46b7d28ffc84feb87283eed9786b562a45229 100644 --- a/lite/api/light_api.h +++ b/lite/api/light_api.h @@ -46,6 +46,7 @@ class LITE_API LightPredictor { LightPredictor(const std::string& lite_model_file, bool model_from_memory = false) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(lite_model_file, model_from_memory); } @@ -57,6 +58,7 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kNaiveBuffer) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory); } @@ -78,6 +80,7 @@ class LITE_API LightPredictor { std::vector GetInputNames(); std::vector GetOutputNames(); void PrepareFeedFetch(); + Scope* scope() { return scope_.get(); } private: void Build(const std::string& lite_model_file, 
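Editor's note: the new BuildRuntimeProgram above no longer instantiates kernels by hand; it only partitions variables (non-persistable activations go into the per-run exec scope, persistable weights stay in the shared root scope, and the special "feed"/"fetch" variables are created once up front) before handing the program desc to RuntimeProgram. A toy sketch of that partitioning with a hypothetical VarInfo type:

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct VarInfo { std::string name; bool persistable; };

int main() {
  std::vector<VarInfo> block = {{"feed", true}, {"conv1_w", true}, {"tmp_0", false}};
  std::set<std::string> root_scope = {"feed", "fetch"};  // workspace prepared up front
  std::set<std::string> exec_scope;
  for (const auto& v : block) {
    if (!v.persistable) exec_scope.insert(v.name);
    else if (v.name != "feed" && v.name != "fetch") root_scope.insert(v.name);
  }
  std::cout << root_scope.size() << " root vars, " << exec_scope.size() << " exec vars\n";  // 3 root vars, 1 exec vars
}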
@@ -91,14 +94,15 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool model_from_memory = false); - void BuildRuntimeProgram(const cpp::ProgramDesc& prog); + void BuildRuntimeProgram( + const std::shared_ptr& program_desc); void DequantizeWeight(); private: std::shared_ptr scope_; std::unique_ptr program_; - cpp::ProgramDesc cpp_program_desc_; + std::shared_ptr program_desc_; std::vector input_names_; std::vector output_names_; }; diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 718ba020fb9c6daa4dc4d7263238692267335a48..c9c34377e2a82b72d26e3148a694fe0662e985ce 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { threads_ = config.threads(); #ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( config.subgraph_model_cache_dir()); #endif } diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 465f82056c6bb80b706cfb7d875773d75735911b..b523d5951b3302c5aa46763625af12e24da0015e 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -97,7 +97,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; + double eps = first_target == TARGET(kOpenCL) ? 
0.25 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index c2fb594e8877020848ecc90c039c31d6f77f638b..e6a53e93e72261082fa220c5fe7b0c12bf60ca87 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -112,6 +112,8 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); } else if (target_repr == "mlu") { @@ -201,6 +203,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -265,16 +268,17 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 4ee18e24a632777c6a3e4a661c90aa9b59654028..ed41a821c0938b599dc8900baa021491df78f329 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { valid_places_.emplace_back(TARGET(kX86)); } else if (target_repr == "npu") { valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places_.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places_.emplace_back(TARGET(kXPU)); } else if (target_repr == "rknpu") { @@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() { " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " "default\n" " `set_lite_out(output_optimize_model_dir)`\n" - " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `record_model_info(false|true)`: refer to whether to record ops " "info for striping lib, false by default`\n" " `run() : start model transformation`\n" @@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + 
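Editor's note: ParserValidPlaces above now accepts "huawei_ascend_npu" in the comma-separated --valid_targets string. A minimal sketch of that string-to-target mapping with a toy enum and parser (the real code attaches precision/layout information to each Place as well):

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

enum class ToyTarget { kARM, kNPU, kHuaweiAscendNPU, kUnk };

std::vector<ToyTarget> ParseValidTargets(const std::string& repr) {
  static const std::map<std::string, ToyTarget> table = {
      {"arm", ToyTarget::kARM},
      {"npu", ToyTarget::kNPU},
      {"huawei_ascend_npu", ToyTarget::kHuaweiAscendNPU}};
  std::vector<ToyTarget> out;
  std::stringstream ss(repr);
  std::string item;
  while (std::getline(ss, item, ',')) {
    auto it = table.find(item);
    out.push_back(it == table.end() ? ToyTarget::kUnk : it->second);
  }
  return out;
}

int main() {
  auto places = ParseValidTargets("arm,huawei_ascend_npu");
  std::cout << places.size() << " targets\n";  // 2 targets
}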
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "paddlelite opt version:" << opt_version << std::endl << help_info << std::endl; @@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index bfeff4879820f132a331e9bff56a5f9c494fe775..08d2233536b90d2b39c7ba6e6733036652179d5f 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "lite/api/paddle_api.h" + +#include + #include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" @@ -21,10 +24,30 @@ #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/target_wrapper.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_runtime.h" +#endif namespace paddle { namespace lite_api { +bool IsOpenCLBackendValid() { + bool opencl_valid = false; +#ifdef LITE_WITH_OPENCL + opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(); +#endif + LOG(INFO) << "opencl_valid:" << opencl_valid; + return opencl_valid; +} + Tensor::Tensor(void *raw) : raw_tensor_(raw) {} // TODO(Superjomn) refine this by using another `const void* const_raw`; @@ -97,6 +120,13 @@ void Tensor::CopyFromCpu(const T *src_data) { data, src_data, num * sizeof(T), lite::IoDirection::HtoD); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::HtoD); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; @@ -117,6 +147,13 @@ void Tensor::CopyToCpu(T *data) const { data, src_data, num * sizeof(T), lite::IoDirection::DtoH); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::DtoH); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; @@ -138,6 +175,11 @@ template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); +template void Tensor::CopyFromCpu(const float *); +template void Tensor::CopyFromCpu(const int8_t *); + template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int8_t *) const; @@ -228,13 +270,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { mlu_input_layout_ = layout; } -void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { - mlu_use_first_conv_ = 
use_first_conv; -} -void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { +void CxxConfig::set_mlu_firstconv_param(const std::vector &mean, + const std::vector &std) { mlu_first_conv_mean_ = mean; -} -void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { mlu_first_conv_std_ = std; } lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { @@ -242,18 +280,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { } int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } -bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -const std::vector &CxxConfig::mlu_first_conv_mean() const { - return mlu_first_conv_mean_; -} -const std::vector &CxxConfig::mlu_first_conv_std() const { - return mlu_first_conv_std_; +std::pair, std::vector> +CxxConfig::mlu_firstconv_param() const { + return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_); } #endif void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { #ifdef LITE_WITH_XPU - lite::Context::SetWorkspaceL3Size(l3_size); + lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_workspace_l3_size_per_thread' is ignored, please " @@ -263,7 +298,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) { #ifdef LITE_WITH_XPU - lite::Context::SetDev(dev_no); + lite::TargetWrapperXPU::SetDev(dev_no); #else LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " "ignored, please rebuild it with LITE_WITH_XPU=ON."; @@ -272,7 +307,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { #ifdef LITE_WITH_XPU - lite::Context::_multi_encoder_precision = precision; + lite::TargetWrapperXPU::multi_encoder_precision = precision; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_multi_encoder_precision' is " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index d28ea8fdbf3f77a15f9ef561e03555090fddac97..6fe00bbd32d51e7d923901792e9d62166058c406 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -21,6 +21,7 @@ #define PADDLE_LITE_API_H_ #include #include +#include #include #include "paddle_place.h" // NOLINT @@ -32,6 +33,9 @@ using lod_t = std::vector>; enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; +// return true if current device supports OpenCL model +LITE_API bool IsOpenCLBackendValid(); + struct LITE_API Tensor { explicit Tensor(void* raw); explicit Tensor(const void* raw); @@ -122,6 +126,7 @@ class LITE_API ConfigBase { PowerMode mode_{LITE_POWER_NO_BIND}; // to save subgraph model for npu/xpu/... std::string subgraph_model_cache_dir_{""}; + int device_id_{0}; public: explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); @@ -141,6 +146,9 @@ class LITE_API ConfigBase { const std::string& subgraph_model_cache_dir() const { return subgraph_model_cache_dir_; } + // set Device ID + void set_device_id(int device_id) { device_id_ = device_id; } + const int get_device_id() const { return device_id_; } }; /// CxxConfig is the config for the Full feature predictor. 
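Editor's note: the three MLU first-conv setters are merged into a single set_mlu_firstconv_param(mean, std) call above, with a paired getter. A hedged usage sketch (requires a library built with LITE_WITH_MLU=ON; the mean/std values are illustrative ImageNet statistics, not values taken from this patch):

#include <vector>
#include "paddle_api.h"

int main() {
  paddle::lite_api::CxxConfig config;
  // One call now carries both per-channel vectors used by MLU's uint8 first conv.
  std::vector<float> mean = {0.485f, 0.456f, 0.406f};
  std::vector<float> std_dev = {0.229f, 0.224f, 0.225f};
  config.set_mlu_firstconv_param(mean, std_dev);
  // The matching getter returns them together as a std::pair.
  auto p = config.mlu_firstconv_param();
  return p.first.size() == p.second.size() ? 0 : 1;
}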
@@ -160,9 +168,8 @@ class LITE_API CxxConfig : public ConfigBase { lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; - bool mlu_use_first_conv_{false}; - std::vector mlu_first_conv_mean_; - std::vector mlu_first_conv_std_; + std::vector mlu_first_conv_mean_{}; + std::vector mlu_first_conv_std_{}; #endif public: @@ -210,24 +217,22 @@ class LITE_API CxxConfig : public ConfigBase { void set_mlu_core_version(lite_api::MLUCoreVersion core_version); // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); - // set MLU input layout. User can specify layout of input data to be NHWC, - // default is NCHW - void set_mlu_input_layout(DataLayoutType layout); // whether use MLU's first conv kernel. First conv is a special kernel // provided by MLU, its input is uint8, and also needs two 3-dimentional // vectors which save all inputs' mean and std values - void set_mlu_use_first_conv(bool use_first_conv); - // set the 3-dimentional mean vector used by MLU's first conv - void set_mlu_first_conv_mean(const std::vector& mean); - // set the 3-dimentional std vector used by MLU's first conv - void set_mlu_first_conv_std(const std::vector& std); + // set the 3-dimentional mean vector and 3-dimentional std vector used by + // MLU's first conv + void set_mlu_firstconv_param(const std::vector& mean, + const std::vector& std); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; - bool mlu_use_first_conv() const; - const std::vector& mlu_first_conv_mean() const; - const std::vector& mlu_first_conv_std() const; + // std::pair + std::pair, std::vector> mlu_firstconv_param() const; #endif // XPU only, set the size of the workspace memory from L3 cache for the diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 832867df079efa1baebf08da4c0d8e37958460f1..4edd61277059e20f7dfb1b8410a784fd04d85502 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,8 +15,11 @@ #include "lite/api/paddle_api.h" #include #include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" + DEFINE_string(model_dir, "", ""); namespace paddle { diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 9bc63e78aae92556a312eb36c3415f9d57c2239a..29a119a6916e1e9fe9880c801291072351c18365 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -54,7 +54,8 @@ const std::string& ActivationTypeToStr(ActivationType act) { "Sigmoid", "Tanh", "Swish", - "Exp"}; + "Exp", + "ThresholdedRelu"}; auto x = static_cast(act); CHECK_LT(x, static_cast(ActivationType::NUM)); return act2string[x]; @@ -74,7 +75,8 @@ const std::string& TargetToStr(TargetType target) { "bm", "mlu", "rknpu", - "apu"}; + "apu", + "huawei_ascend_npu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -117,7 +119,8 @@ const std::string& TargetRepr(TargetType target) { "kBM", "kMLU", "kRKNPU", - "kAPU"}; + "kAPU", + "kHuaweiAscendNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -162,7 +165,8 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kMLU), 
TARGET(kAPU), TARGET(kRKNPU), - TARGET(kFPGA)}); + TARGET(kFPGA), + TARGET(kHuaweiAscendNPU)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 7066656f18ec0693048223f5f1201e77a1b0a37d..5161d6b58af01f7af4dcbaec6a1cacb91e7c7056 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -57,7 +57,8 @@ enum class TargetType : int { kMLU = 11, kRKNPU = 12, kAPU = 13, - NUM = 14, // number of fields. + kHuaweiAscendNPU = 14, + NUM = 15, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -106,7 +107,8 @@ enum class ActivationType : int { kAbs = 9, kHardSwish = 10, kReciprocal = 11, - NUM = 12, + kThresholdedRelu = 12, + NUM = 13, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 6732b968734631cf74c1e8fc7b825f3e0b89b9fe..f132b2064e76a85865b6092240ec96d6af9ae49a 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -26,7 +26,9 @@ USE_MIR_PASS(argument_type_display_pass); USE_MIR_PASS(runtime_context_assign_pass); USE_MIR_PASS(graph_visualize_pass); +USE_MIR_PASS(remove_tf_redundant_ops_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); +USE_MIR_PASS(lite_conv_conv_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); @@ -46,14 +48,18 @@ USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); +USE_MIR_PASS(huawei_ascend_npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass) USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass); +USE_MIR_PASS(__xpu__mmdnn_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index b7b24dfcea31d6e6e78538c6ac33923116b2e5a5..e32b61094a0b9ce9781cb6e9b8aef7ab753d7278 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) { .value("MLU", TargetType::kMLU) .value("RKNPU", TargetType::kRKNPU) .value("APU", TargetType::kAPU) + .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc index d70ecf3c03955286244aa13cfe65f19569a55930..ded851d93313c3e155dd7f8860eee7446e56e715 100644 --- a/lite/api/test_yolov3_lite_bm.cc +++ b/lite/api/test_yolov3_lite_bm.cc @@ -59,9 +59,9 @@ void TestModel(const std::vector& valid_places) { } auto* image_tensor = predictor.GetInput(1); image_tensor->Resize(DDim(std::vector({1, 2}))); - data = image_tensor->mutable_data(); - data[0] = FLAGS_im_height; - data[1] = FLAGS_im_width; + auto* data_1 = image_tensor->mutable_data(); + data_1[0] = FLAGS_im_height; + data_1[1] = FLAGS_im_width; for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); diff --git a/lite/backends/CMakeLists.txt 
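Editor's note: the enum edits above illustrate an invariant worth keeping in mind: whenever a backend such as kHuaweiAscendNPU is appended to TargetType, NUM must be bumped and every name table (TargetToStr, TargetRepr, ExpandValidTargets, the Python binding, ...) must grow by the same entry, otherwise the CHECK_LT(x, NUM)-guarded lookups go out of range. A small sketch of that pattern with a toy enum, not the real paddle_place.h types:

#include <cassert>
#include <iostream>
#include <string>
#include <vector>

enum class ToyTarget : int { kHost = 0, kARM = 1, kHuaweiAscendNPU = 2, NUM = 3 };

const std::string& ToyTargetRepr(ToyTarget t) {
  static const std::vector<std::string> names = {"kHost", "kARM", "kHuaweiAscendNPU"};
  int x = static_cast<int>(t);
  // Keep the table and the enum in lockstep.
  assert(names.size() == static_cast<size_t>(ToyTarget::NUM));
  assert(x < static_cast<int>(ToyTarget::NUM));
  return names[x];
}

int main() { std::cout << ToyTargetRepr(ToyTarget::kHuaweiAscendNPU) << "\n"; }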
b/lite/backends/CMakeLists.txt index 7f0d53f976ace17ee8d95e62e62d56f5cb974881..27a8a46cfa1413ea0d9ffa3641d8e4bd60785e11 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(mlu) add_subdirectory(bm) add_subdirectory(apu) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index aecec295ae0269fb34a3c4fa38e396bdf98d4418..9cf8f6a507401656bb0df214bd463a09fd82a61d 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -83,6 +83,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s2_depthwise_int8.cc conv5x5s2_depthwise_fp32.cc conv3x3_winograd_fp32_c4.cc + conv3x3_winograd_int8.cc conv_winograd_3x3.cc conv_impl.cc softmax.cc @@ -126,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) split_merge_lod_tenosr.cc reduce_prod.cc lstm.cc + clip.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 1d01642100109d14a413ad5e274606c88bf0005a..01f25cbd36d327f7a3c252fdc675262d39748318 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -753,23 +753,15 @@ void act_abs(const float* din, float* dout, int size, int threads) { } } -#ifdef LITE_WITH_TRAIN template <> -void act_square_grad(const float* din, - const float* dout_grad, - float* din_grad, - int size, - int threads) { - const float* ptr_out_grad = dout_grad; - float* ptr_in_grad = din_grad; +void act_thresholded_relu( + const float* din, float* dout, int size, float threshold, int threads) { for (int i = 0; i < size; ++i) { - ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; - ptr_out_grad++; - ptr_in_grad++; + dout[0] = (din[0] > threshold ? din[0] : 0.f); din++; + dout++; } } -#endif } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 50f60f300bbab9b9f0bcad222f31699b7bfadeab..b0147040cd11a888ec045948f0914a13aa932a2f 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -86,11 +86,9 @@ void act_reciprocal(const T* din, T* dout, int size, int threads); template void act_abs(const T* din, T* dout, int size, int threads); -#ifdef LITE_WITH_TRAIN template -void act_square_grad( - const T* din, const T* dout_grad, T* din_grad, int size, int threads); -#endif +void act_thresholded_relu( + const T* din, T* dout, int size, float threshold, int threads); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc index 32b7d3bfeba6107493d62a0c9be14a3c15ce7692..74dfa143bda97219874b0e53efc7de34b0416c0e 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids, selected_ids->Resize(dims); selected_scores->Resize(dims); if (parent_idx) { - parent_idx->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); } auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); diff --git a/lite/backends/arm/math/clip.cc b/lite/backends/arm/math/clip.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f8b48db53b9fe1b50a0832a64b3849faa417fb8 --- /dev/null +++ b/lite/backends/arm/math/clip.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
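Editor's note: usage sketch for the new act_thresholded_relu routine added to lite/backends/arm/math/activation.{h,cc} above (an ARM build of the library is assumed). The scalar semantics come straight from activation.cc: y = x when x > threshold, otherwise 0; the threads argument is unused by the scalar path shown in this patch.

#include <iostream>
#include "lite/backends/arm/math/activation.h"

int main() {
  const float x[4] = {-1.f, 0.5f, 1.0f, 2.5f};
  float y[4];
  paddle::lite::arm::math::act_thresholded_relu<float>(
      x, y, /*size=*/4, /*threshold=*/1.0f, /*threads=*/1);
  for (float v : y) std::cout << v << " ";  // 0 0 0 2.5 (1.0 is not strictly above the threshold)
  std::cout << "\n";
}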
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/clip.h" +#include +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/saturate.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output) { + float tmp; + for (int64_t i = 0; i < num; i++) { + tmp = *input; + tmp = tmp > min ? tmp : min; + *output = tmp < max ? tmp : max; + input++; + output++; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/clip.h b/lite/backends/arm/math/clip.h new file mode 100644 index 0000000000000000000000000000000000000000..cd74a8880abfb660c13c630ca708fa9c8f849d12 --- /dev/null +++ b/lite/backends/arm/math/clip.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
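Editor's note: usage sketch for the new clip_kernel_fp32 added above (ARM build assumed). Every element is clamped into [min, max]; the function operates on raw pointers, so the caller owns both buffers.

#include <iostream>
#include "lite/backends/arm/math/clip.h"

int main() {
  const float in[5] = {-2.f, -0.5f, 0.f, 0.5f, 2.f};
  float out[5];
  paddle::lite::arm::math::clip_kernel_fp32(in, /*num=*/5, /*min=*/-1.f, /*max=*/1.f, out);
  for (float v : out) std::cout << v << " ";  // -1 -0.5 0 0.5 1
  std::cout << "\n";
}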
+ +#pragma once + +#include +#include +#include +#include "lite/operators/op_params.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index 35d9eeaee1b69bed423cd3b489217c71575b3079..2957085493f15016abf2bf50f0aabecbe95f5b36 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -1245,7 +1245,7 @@ void weight_trans_c4_8x8( for (int i = 0; i < ch_out * ch_in * 64; ++i) { int new_c = i % 64; int new_oc = i / ch_in / 64 / 4; - int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_ic = i / 64 % ch_in; int new_inner = i / ch_in / 64 % 4; int dest_ind = new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; @@ -1302,7 +1302,7 @@ void weight_trans_c4_4x4( for (int i = 0; i < ch_out * ch_in * 16; ++i) { int new_c = i % 16; int new_oc = i / ch_in / 16 / 4; - int new_ic = i / 16 % (ch_in * 4) % ch_in; + int new_ic = i / 16 % ch_in; int new_inner = i / ch_in / 16 % 4; int dest_ind = new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; diff --git a/lite/backends/arm/math/conv3x3_winograd_int8.cc b/lite/backends/arm/math/conv3x3_winograd_int8.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b9870730e0cec07add470ad13292e1598736e5a --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_int8.cc @@ -0,0 +1,602 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
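Editor's note: the new_ic change in weight_trans_c4_8x8/weight_trans_c4_4x4 above is a pure simplification, not a behavioral fix: since ch_in * 4 is a multiple of ch_in, (x % (ch_in * 4)) % ch_in is always equal to x % ch_in. A tiny standalone check of that identity over the same index range the loops use (sizes are arbitrary examples):

#include <cassert>
#include <iostream>

int main() {
  const int ch_in = 3, ch_out = 5;
  for (int i = 0; i < ch_out * ch_in * 64; ++i) {
    int old_ic = i / 64 % (ch_in * 4) % ch_in;
    int new_ic = i / 64 % ch_in;
    assert(old_ic == new_ic);  // the shorter form decomposes the index identically
  }
  std::cout << "identical for all indices\n";
}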
+ +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c8_4x4_int8(const int8_t* src, + int src_stride, + int src_h_stride, + int16_t* dest, + int dest_stride, + int dest_h_stride); +void output_trans_c8_post_2x4_int8(const int32_t* src, + int src_stride, + int src_h_stride, + int32_t* dest, + int dest_stride, + int dest_h_stride); +void weight_trans_c8_4x4_int8( + int16_t* dest, const int8_t* src, int ic, int oc, void* workspace); + +// F(2,3) +template +void conv_compute_2x2_3x3_int8(const int8_t* input, + Dtype* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx) { + auto act_param = param.activation_param; + const int pad_h0 = (*param.paddings)[0]; + const int pad_h1 = (*param.paddings)[1]; + const int pad_w0 = (*param.paddings)[2]; + const int pad_w1 = (*param.paddings)[3]; + int8_t* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(int8_t); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_8 = (chin + 7) / 8; + int oc_8 = (chout + 7) / 8; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w0 + pad_w1; + int h_pad = hin + pad_h0 + pad_h1; + + const int zero_len = (w_pad + 3) / 4 * 4; + Dtype zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(Dtype)); + + int8_t* input_c8 = tmp_work_space; + int new_h_stride = w_pad * 8; + int new_c_stride = new_h_stride * h_pad; + + int ic_8_stride = w_pad * h_pad * 8; + int oc_8_stride = wout * hout * 8; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + int16_t* g_tmp_data = + (int16_t*)(tmp_work_space + ic_8 * ic_8_stride + // NOLINT + oc_8 * oc_8_stride * sizeof(int32_t)); + int tmp_input_thread_stride = tile_block * ic_8 * 128; + int tmp_output_thread_stride = tile_block * oc_8 * 128; + int tmp_data_thread_stride_size = tmp_input_thread_stride * sizeof(int16_t) + + tmp_output_thread_stride * sizeof(int32_t); + memset(g_tmp_data, 0, tmp_data_thread_stride_size); + int8_t* g_trans_remain_tmp_data = + (int8_t*)(g_tmp_data + // NOLINT + threads * (tmp_input_thread_stride + + tmp_output_thread_stride * sizeof(int32_t) / + sizeof(int16_t))); + int32_t* g_trans_tmp_data = + (int32_t*)(g_trans_remain_tmp_data + threads * 128); // NOLINT + auto act_type = act_param.active_type; + int flag_act = 0; // relu: 1, relu6: 2, leakey: 3 + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 1; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 2; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 3; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } + // begin compute + for (int ni = 0; ni < num; 
++ni) { + // trans input to c8 + for (int i = 0; i < ic_8; ++i) { + prepack_input_nxwc8_int8_dw(input + ni * in_n_stride, + input_c8 + i * new_c_stride, + i * 8, + -pad_h0, + hin + pad_h1, + -pad_w0, + win + pad_w1, + chin, + win, + hin); + } + int32_t* output_c8 = (int32_t*)(input_c8 + ic_8 * ic_8_stride); // NOLINT + Dtype* output_ptr = output + ni * out_n_stride; + + const int16_t* weight_ptr = weight; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + int16_t* tmp_data = + g_tmp_data + + omp_get_thread_num() * tmp_data_thread_stride_size / sizeof(int16_t); + int32_t* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 32; + int8_t* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 128; +#else + int16_t* tmp_data = g_tmp_data; + int32_t* trans_tmp_data = g_trans_tmp_data; + int8_t* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_8 * 8; + int b_gi_stride = tile_count * ic_8 * 8; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + int16_t* dst_ptr = tmp_data + ti * 8; + const int8_t* src_ptr = input_c8 + (src_y * w_pad + src_x) * 8; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_8; ++ci) { + const int8_t* src_ci = src_ptr + ci * ic_8_stride; + int16_t* dst_ci = dst_ptr + ci * tile_count * 8; + input_trans_c8_4x4_int8( + src_ci, 8, w_pad * 8, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_8; ++ci) { + const int8_t* src_ci = src_ptr + ci * ic_8_stride; + // pad + memset(trans_remain_tmp_data, 0, 128 * sizeof(int8_t)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + int8_t* dst_yi = trans_remain_tmp_data + yi * 32; + const int8_t* src_yi = src_ci + w_pad * yi * 8; + memcpy(dst_yi, src_yi, x_size * sizeof(int8_t) * 8); + } + } + + // trans + int16_t* dst_ci = dst_ptr + ci * tile_count * 8; + input_trans_c8_4x4_int8(trans_remain_tmp_data, + 8, + 32, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + int32_t* dst_temp_data = + (int32_t*)(tmp_data + tmp_input_thread_stride); // NOLINT + int16_t* b_ptr = tmp_data; + int w_gi_stride = ic_8 * oc_8 * 64; + for (int gi = 0; gi < 16; ++gi) { + int32_t* origin_C = dst_temp_data + gi * c_gi_stride; + int16_t* origin_B = b_ptr + gi * b_gi_stride; + const int16_t* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c8_int16_small( + oc_8 * 8, tile_count, ic_8 * 8, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + int32_t* src_ptr = dst_temp_data + ti * 8; + int32_t* trans_remain_tmp_i32_data = + (int32_t*)(trans_remain_tmp_data); // NOLINT + int32_t* dst_ptr = output_c8 + (dst_y * wout + dst_x) * 8; + + if (ex == 2 && ey == 2) { + // trans output + for (int ci = 0; ci < oc_8; ++ci) { + int cur_ind = ci * 8; + + int32_t* src_ci = src_ptr + ci * tile_count * 8; + int32_t* dst_ci = dst_ptr + ci * oc_8_stride; + output_trans_c8_post_2x4_int8( + src_ci, c_gi_stride, c_gi_stride * 4, dst_ci, 8, wout * 8); + } + } else { + for (int ci = 0; ci < oc_8; ++ci) { + int cur_ind = ci * 8; + // trans output + int32_t* src_ci = src_ptr + ci * tile_count * 8; + output_trans_c8_post_2x4_int8(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_i32_data, + 8, + 16); + // copy to dest + int32_t* dst_ci = dst_ptr + ci * oc_8_stride; + for (int i = 0; i < ey; ++i) { + memcpy(dst_ci + i * wout * 8, + trans_remain_tmp_i32_data + i * 16, + ex * sizeof(int32_t) * 8); + } + } + } + } + //*/ + } // for block_count + const float* bias_local_ptr = bias; + for (int ci = 0; ci < oc_8; ++ci) { + float bias_local[8] = {bias_local_ptr[0], + bias_local_ptr[1], + bias_local_ptr[2], + bias_local_ptr[3], + bias_local_ptr[4], + bias_local_ptr[5], + bias_local_ptr[6], + bias_local_ptr[7]}; + write_int32_nchwc8_to_nchw(output_c8 + ci * oc_8_stride, + output_ptr, + ci * 8, + ci * 8 + 8, + 0, + hout, + 0, + wout, + chout, + hout, + wout, + flag_act, + alpha, + bias_local, + param.bias, + zero_ptr, + scale + ci * 8); + bias_local_ptr += 8; + } + } // for num +} // conv compute +template void conv_compute_2x2_3x3_int8( + const int8_t* input, + int8_t* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx); +template void conv_compute_2x2_3x3_int8( + const int8_t* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const int16_t* weight, + const float* bias, + const float* scale, + const operators::ConvParam& param, + ARMContext* ctx); + +// BT=[1, 0, -1, 0, +// 0, 1, 1, 0, +// 0, -1, 1, 0, +// 0, 1, 0, -1] +void input_trans_c8_4x4_int8(const int8_t* src, + int src_stride, + int src_h_stride, + int16_t* dest, + int dest_stride, + int dest_h_stride) { + int8x8_t src00 = vld1_s8(src); + int8x8_t src01 = vld1_s8(src + src_stride); + int8x8_t src02 = vld1_s8(src + src_stride + src_stride); + int8x8_t src03 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src10 = vld1_s8(src); + int8x8_t src11 = vld1_s8(src + src_stride); + int8x8_t src12 = vld1_s8(src + src_stride + src_stride); + int8x8_t src13 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src20 = vld1_s8(src); + int8x8_t src21 = vld1_s8(src + src_stride); + int8x8_t src22 = vld1_s8(src + src_stride + src_stride); + int8x8_t src23 = vld1_s8(src + src_stride + src_stride + src_stride); + src += src_h_stride; + int8x8_t src30 = vld1_s8(src); + int8x8_t src31 = vld1_s8(src + src_stride); + int8x8_t src32 = vld1_s8(src + src_stride + src_stride); + int8x8_t src33 = vld1_s8(src + src_stride + src_stride + src_stride); + + int16x8_t dst00 = vsubl_s8(src00, src02); + int16x8_t dst10 = vaddl_s8(src01, src02); + int16x8_t dst20 = vsubl_s8(src02, src01); + int16x8_t dst30 = vsubl_s8(src01, src03); + + int16x8_t dst01 = vsubl_s8(src10, src12); + int16x8_t dst11 = vaddl_s8(src11, src12); 
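Editor's note: a scalar reference sketch of the F(2,3) input transform whose NEON version (input_trans_c8_4x4_int8) appears above. It computes V = B^T * d * B with the B^T rows given in the source comment ([1,0,-1,0], [0,1,1,0], [0,-1,1,0], [0,1,0,-1]), written for a single channel so the column and row passes are easy to follow; the NEON code applies the same pattern to 8 channels at once.

#include <cstdint>
#include <iostream>

void winograd_f23_input_transform(const int8_t d[4][4], int16_t v[4][4]) {
  int16_t t[4][4];  // B^T * d (first pass over columns)
  for (int c = 0; c < 4; ++c) {
    t[0][c] = d[0][c] - d[2][c];
    t[1][c] = d[1][c] + d[2][c];
    t[2][c] = d[2][c] - d[1][c];
    t[3][c] = d[1][c] - d[3][c];
  }
  for (int r = 0; r < 4; ++r) {  // (B^T * d) * B (same pattern over rows)
    v[r][0] = t[r][0] - t[r][2];
    v[r][1] = t[r][1] + t[r][2];
    v[r][2] = t[r][2] - t[r][1];
    v[r][3] = t[r][1] - t[r][3];
  }
}

int main() {
  int8_t d[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}};
  int16_t v[4][4];
  winograd_f23_input_transform(d, v);
  std::cout << v[0][0] << "\n";  // 0 for this ramp-shaped tile
}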
+ int16x8_t dst21 = vsubl_s8(src12, src11); + int16x8_t dst31 = vsubl_s8(src11, src13); + + int16x8_t dst02 = vsubl_s8(src20, src22); + int16x8_t dst12 = vaddl_s8(src21, src22); + int16x8_t dst22 = vsubl_s8(src22, src21); + int16x8_t dst32 = vsubl_s8(src21, src23); + + int16x8_t dst03 = vsubl_s8(src30, src32); + int16x8_t dst13 = vaddl_s8(src31, src32); + int16x8_t dst23 = vsubl_s8(src32, src31); + int16x8_t dst33 = vsubl_s8(src31, src33); + + int16x8_t dest00 = vsubq_s16(dst00, dst02); + int16x8_t dest10 = vaddq_s16(dst01, dst02); + int16x8_t dest20 = vsubq_s16(dst02, dst01); + int16x8_t dest30 = vsubq_s16(dst01, dst03); + + int16x8_t dest01 = vsubq_s16(dst10, dst12); + int16x8_t dest11 = vaddq_s16(dst11, dst12); + int16x8_t dest21 = vsubq_s16(dst12, dst11); + int16x8_t dest31 = vsubq_s16(dst11, dst13); + + int16x8_t dest02 = vsubq_s16(dst20, dst22); + int16x8_t dest12 = vaddq_s16(dst21, dst22); + int16x8_t dest22 = vsubq_s16(dst22, dst21); + int16x8_t dest32 = vsubq_s16(dst21, dst23); + + int16x8_t dest03 = vsubq_s16(dst30, dst32); + int16x8_t dest13 = vaddq_s16(dst31, dst32); + int16x8_t dest23 = vsubq_s16(dst32, dst31); + int16x8_t dest33 = vsubq_s16(dst31, dst33); + + vst1q_s16(dest, dest00); + vst1q_s16(dest + dest_stride, dest10); + vst1q_s16(dest + dest_stride + dest_stride, dest20); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest30); + dest += dest_h_stride; + vst1q_s16(dest, dest01); + vst1q_s16(dest + dest_stride, dest11); + vst1q_s16(dest + dest_stride + dest_stride, dest21); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest31); + dest += dest_h_stride; + vst1q_s16(dest, dest02); + vst1q_s16(dest + dest_stride, dest12); + vst1q_s16(dest + dest_stride + dest_stride, dest22); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest32); + dest += dest_h_stride; + vst1q_s16(dest, dest03); + vst1q_s16(dest + dest_stride, dest13); + vst1q_s16(dest + dest_stride + dest_stride, dest23); + vst1q_s16(dest + dest_stride + dest_stride + dest_stride, dest33); +} + +// AT=[1, 1, 1, 0, +// 0, 1, -1, -1] +void output_trans_c8_post_2x4_int8(const int32_t* src, + int src_stride, + int src_h_stride, + int32_t* dest, + int dest_stride, + int dest_h_stride) { + int32x4_t src400 = vld1q_s32(src); + int32x4_t src800 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src401 = vld1q_s32(src); + int32x4_t src801 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src402 = vld1q_s32(src); + int32x4_t src802 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src403 = vld1q_s32(src); + int32x4_t src803 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src410 = vld1q_s32(src); + int32x4_t src810 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src411 = vld1q_s32(src); + int32x4_t src811 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src412 = vld1q_s32(src); + int32x4_t src812 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src413 = vld1q_s32(src); + int32x4_t src813 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src420 = vld1q_s32(src); + int32x4_t src820 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src421 = vld1q_s32(src); + int32x4_t src821 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src422 = vld1q_s32(src); + int32x4_t src822 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src423 = vld1q_s32(src); + int32x4_t src823 = vld1q_s32(src + 4); + + src += src_h_stride - 3 * src_stride; + + int32x4_t src430 = vld1q_s32(src); + int32x4_t src830 = 
vld1q_s32(src + 4); + src += src_stride; + int32x4_t src431 = vld1q_s32(src); + int32x4_t src831 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src432 = vld1q_s32(src); + int32x4_t src832 = vld1q_s32(src + 4); + src += src_stride; + int32x4_t src433 = vld1q_s32(src); + int32x4_t src833 = vld1q_s32(src + 4); + + int32x4_t dst400 = vaddq_s32(vaddq_s32(src400, src401), src402); + int32x4_t dst410 = vsubq_s32(vsubq_s32(src401, src402), src403); + int32x4_t dst401 = vaddq_s32(vaddq_s32(src410, src411), src412); + int32x4_t dst411 = vsubq_s32(vsubq_s32(src411, src412), src413); + int32x4_t dst402 = vaddq_s32(vaddq_s32(src420, src421), src422); + int32x4_t dst412 = vsubq_s32(vsubq_s32(src421, src422), src423); + int32x4_t dst403 = vaddq_s32(vaddq_s32(src430, src431), src432); + int32x4_t dst413 = vsubq_s32(vsubq_s32(src431, src432), src433); + + int32x4_t dst800 = vaddq_s32(vaddq_s32(src800, src801), src802); + int32x4_t dst810 = vsubq_s32(vsubq_s32(src801, src802), src803); + int32x4_t dst801 = vaddq_s32(vaddq_s32(src810, src811), src812); + int32x4_t dst811 = vsubq_s32(vsubq_s32(src811, src812), src813); + int32x4_t dst802 = vaddq_s32(vaddq_s32(src820, src821), src822); + int32x4_t dst812 = vsubq_s32(vsubq_s32(src821, src822), src823); + int32x4_t dst803 = vaddq_s32(vaddq_s32(src830, src831), src832); + int32x4_t dst813 = vsubq_s32(vsubq_s32(src831, src832), src833); + + int32x4_t dest400 = vaddq_s32(vaddq_s32(dst400, dst401), dst402); + int32x4_t dest410 = vsubq_s32(vsubq_s32(dst401, dst402), dst403); + int32x4_t dest401 = vaddq_s32(vaddq_s32(dst410, dst411), dst412); + int32x4_t dest411 = vsubq_s32(vsubq_s32(dst411, dst412), dst413); + + int32x4_t dest800 = vaddq_s32(vaddq_s32(dst800, dst801), dst802); + int32x4_t dest810 = vsubq_s32(vsubq_s32(dst801, dst802), dst803); + int32x4_t dest801 = vaddq_s32(vaddq_s32(dst810, dst811), dst812); + int32x4_t dest811 = vsubq_s32(vsubq_s32(dst811, dst812), dst813); + + vst1q_s32(dest, dest400); + vst1q_s32(dest + 4, dest800); + dest += dest_stride; + vst1q_s32(dest, dest410); + vst1q_s32(dest + 4, dest810); + dest += dest_h_stride - dest_stride; + vst1q_s32(dest, dest401); + vst1q_s32(dest + 4, dest801); + dest += dest_stride; + vst1q_s32(dest, dest411); + vst1q_s32(dest + 4, dest811); +} + +void weight_trans_c8_4x4_int8( + int16_t* dest, const int8_t* din, int ch_in, int ch_out, void* workspace) { + const int16_t coeff[4][3] = {{2, 0, 0}, {1, 1, 1}, {1, -1, 1}, {0, 0, 2}}; + + int16_t* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const int8_t* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + int16_t* ptr_channel = ptr_out + (i * ch_in + j) * 16; + + //! transform kernel, transposed + const int8_t* k0 = kernel0; + const int8_t* k1 = kernel0 + 3; + const int8_t* k2 = kernel0 + 6; + + //! h + int16_t tmp[4][3]; + for (int i = 0; i < 4; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 4; j++) { + int16_t* tmpp = &tmp[j][0]; + for (int i = 0; i < 4; i++) { + ptr_channel[j * 4 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 7) / 8 * 8; + int ic_pad = (ch_in + 7) / 8 * 8; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 16; ++i) { + int new_c = i % 16; + int new_oc = i / ch_in / 16 / 8; + int new_ic = i / 16 % ch_in; + int new_inner = i / ch_in / 16 % 8; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 8 + new_ic * 8 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 78d4f3f74e3e8a0fb06b1fda83ad5deed281621b..c72223d2e845bc67b541e6f1790e45129deff62f 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din, } return true; } +// for example: m = 4, n = 4 +// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9 , 10 ,11], [12, 13, 14, 15]] +// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]] +/* + m = 8 n = 8: 0 1 2 3 4 5 6 7 0 8 16 24 32 40 48 56 + 16 17 18 19 20 21 22 23 2 10 18 26 34 42 50 58 + 24 25 26 27 28 29 30 31 3 11 19 27 35 43 51 59 + 32 33 34 35 36 37 38 39 4 12 20 28 36 44 52 60 ... + } + } +*/ +template +void local_transpose(const Dtype* din, Dtype* dout, int m, int n) { + // n % 4 == 0 && m % 4 == 0 + // n * m ==> n * m data trans + int offset_m = m << 2; + const Dtype* din_ptr = din; + Dtype* dout_ptr = dout; + for (int i = 0; i < n; i += 4) { + Dtype* out_ptr0 = dout_ptr; + Dtype* out_ptr1 = dout_ptr + m; + Dtype* out_ptr2 = out_ptr1 + m; + Dtype* out_ptr3 = out_ptr2 + m; + const Dtype* in_ptr0 = din_ptr; + const Dtype* in_ptr1 = din_ptr + m; + const Dtype* in_ptr2 = in_ptr1 + m; + const Dtype* in_ptr3 = in_ptr2 + m; + for (int j = 0; j < m; j += 4) { + float32x4_t vin0 = vld1q_f32(in_ptr0); + float32x4_t vin1 = vld1q_f32(in_ptr1); + float32x4_t vin2 = vld1q_f32(in_ptr2); + float32x4_t vin3 = vld1q_f32(in_ptr3); + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3); + in_ptr0 = in_ptr3 + m; + in_ptr1 = in_ptr3 + 2 * m; + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + in_ptr2 = in_ptr1 + m; + in_ptr3 = in_ptr1 + 2 * m; + vst1q_f32(out_ptr0, tmp0.val[0]); + vst1q_f32(out_ptr1, tmp0.val[1]); + out_ptr0 += 4; + out_ptr1 += 4; + vst1q_f32(out_ptr2, tmp2.val[0]); + vst1q_f32(out_ptr3, tmp2.val[1]); + out_ptr2 += 4; + out_ptr3 += 4; + } + dout_ptr += offset_m; + din_ptr += 4; + } +} +template +void transpose(const Dtype* din, Dtype* dout, int m, int n) { + // nxm == mxn + // 4x4 + int cnt_n = n >> 2; + int remain_n = n & 3; + int cnt_m = m >> 2; + int remain_m = m & 3; + int nn_num = n << 2; // n * 4 + int mm_num = m << 2; // m * 4 + for (int x = 0; x < cnt_n; x++) { + const Dtype* din_ptr0 = din + x * mm_num; + const Dtype* din_ptr1 = din_ptr0 + m; + const Dtype* 
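Editor's note: a scalar reference for the NEON transpose helpers (local_transpose / transpose) added to conv_block_utils.h above. The input is read as rows-many rows of cols elements and written out transposed, matching the worked example in the source comment ([[0,1,2,3],[4,5,6,7],...] -> [[0,4,8,12],[1,5,9,13],...]); parameter names here are deliberately explicit rather than reusing the template's m/n arguments.

#include <iostream>

void transpose_ref(const float* din, float* dout, int rows, int cols) {
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j)
      dout[j * rows + i] = din[i * cols + j];
}

int main() {
  float din[16], dout[16];
  for (int i = 0; i < 16; ++i) din[i] = static_cast<float>(i);
  transpose_ref(din, dout, 4, 4);
  std::cout << dout[0] << " " << dout[1] << " " << dout[2] << " " << dout[3] << "\n";  // 0 4 8 12
}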
din_ptr2 = din_ptr1 + m; + const Dtype* din_ptr3 = din_ptr2 + m; + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03 + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(din0, din1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(din2, din3); + din_ptr0 += 4; + din_ptr1 += 4; + // a00 b00 c00 d00 a02 b02 c02 d02 + // a01 b01 c01 d01 a03 b03 c03 d03 + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + din_ptr2 += 4; + din_ptr3 += 4; + vst1q_f32(dout_ptr0, tmp0.val[0]); + vst1q_f32(dout_ptr1, tmp0.val[1]); + dout_ptr0 += nn_num; + vst1q_f32(dout_ptr2, tmp2.val[0]); + vst1q_f32(dout_ptr3, tmp2.val[1]); + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + *dout_ptr0++ = *din_ptr1++; + *dout_ptr0++ = *din_ptr2++; + *dout_ptr0++ = *din_ptr3++; + } + } + const Dtype* din_ptr0 = din + cnt_n * mm_num; + dout = dout + cnt_n * 4; + for (int x = 0; x < remain_n; x++) { + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + din_ptr0 += 4; + *dout_ptr0 = din0[0]; + *dout_ptr1 = din0[1]; + dout_ptr0 += nn_num; + *dout_ptr2 = din0[2]; + *dout_ptr3 = din0[3]; + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + } + } +} /*preprocessing inputs * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] * n = he - hs @@ -3762,6 +3907,7 @@ inline void write_int32_nchwc8_to_nchw(const int* din, int w_stride = we - ws; int valid_w = (we > width ? 
width : we) - ws; int cnt = valid_w / 4;
+ int remain = valid_w & 3;
float32x4_t w_scale0 = vld1q_f32(scale); float32x4_t w_scale1 = vld1q_f32(scale + 4);
@@ -3818,10 +3964,10 @@ inline void write_int32_nchwc8_to_nchw(const int* din, flag_act, alpha); }
- if (we > width) {
+ if (remain > 0) {
int offset = 32 * cnt; din_hei_ptr = ptr_din + offset;
- for (int j = ws + cnt * 4; j < width; ++j) {
+ for (int j = 0; j < remain; ++j) {
if (flag_bias) { *(doutc0_ptr++) = cvt_kernel( din_hei_ptr[0], scale[0], bias[0], flag_act, alpha[0]);
diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h
index 28a2fb7e2a42a27e9ecd3d42b25f9942b481004e..495a13eec17a0c35e90fbf3ef47c505028721857 100644
--- a/lite/backends/arm/math/conv_impl.h
+++ b/lite/backends/arm/math/conv_impl.h
@@ -359,6 +359,35 @@ void conv_compute_2x2_3x3_small(const float* input, const float* bias, const operators::ConvParam& param, ARMContext* ctx);
+void input_trans_c8_4x4_int8(const int8_t* src,
+ int src_stride,
+ int src_h_stride,
+ int16_t* dest,
+ int dest_stride,
+ int dest_h_stride);
+void output_trans_c8_post_2x4_int8(const int32_t* src,
+ int src_stride,
+ int src_h_stride,
+ int32_t* dest,
+ int dest_stride,
+ int dest_h_stride);
+void weight_trans_c8_4x4_int8(
+ int16_t* dest, const int8_t* src, int ic, int oc, void* workspace);
+template <typename Dtype>
+void conv_compute_2x2_3x3_int8(const int8_t* input,
+ Dtype* output,
+ int num,
+ int chout,
+ int hout,
+ int wout,
+ int chin,
+ int hin,
+ int win,
+ const int16_t* weight,
+ const float* bias,
+ const float* scale,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
template <typename Dtype> void im2col(const Dtype* data_im,
diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc
index 4d08c1e957d43b5b748ffdb90fd14a07a61d0183..a73a63ddcb67f8790f73aff3fff8368f4005b7e1 100644
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
- #include "lite/backends/arm/math/elementwise.h" +#include #include "lite/backends/arm/math/funcs.h"
@@ -747,6 +747,16 @@ void elementwise_mul(const int* dinx, } }
+template <>
+void elementwise_mul(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ dout[i] = dinx[i] * diny[i];
+ }
+}
+
template <> void elementwise_mul_relu(const float* dinx, const float* diny,
@@ -801,6 +811,17 @@ void elementwise_mul_relu(const float* dinx, } }
+template <>
+void elementwise_mul_relu(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ int64_t tmp = dinx[i] * diny[i];
+ dout[i] = tmp > 0 ? tmp : 0;
+ }
+}
+
template <> void elementwise_mul_broadcast(const float* dinx, const float* diny,
@@ -935,6 +956,29 @@ void elementwise_mul_broadcast(const int* dinx, } }
+template <>
+void elementwise_mul_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* dinx_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int k = 0; k < num; ++k) {
+ *dout_ptr = *dinx_ptr * diny_data;
+ dout_ptr++;
+ dinx_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_mul_relu_broadcast(const float* dinx, const float* diny,
@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast(const float* dinx, } }
+template <>
+void elementwise_mul_relu_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* dinx_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int k = 0; k < num; ++k) {
+ int64_t tmp = *dinx_ptr * diny_data;
+ *dout_ptr = tmp > 0 ? tmp : 0;
+ dout_ptr++;
+ dinx_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_max(const float* dinx, const float* diny,
@@ -1254,6 +1322,19 @@ void elementwise_max_relu_broadcast(const float* dinx, } }
+template <>
+void elementwise_div(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num) {
+ for (int i = 0; i < num; i++) {
+ *dout = *dinx / *diny;
+ dout++;
+ dinx++;
+ diny++;
+ }
+}
+
template <> void elementwise_div(const float* dinx, const float* diny,
@@ -1306,6 +1387,28 @@ void elementwise_div(const float* dinx, } }
+template <>
+void elementwise_div_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num) {
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const int64_t* din_ptr = dinx + offset;
+ const int64_t diny_data = diny[j];
+ int64_t* dout_ptr = dout + offset;
+ for (int p = 0; p < num; p++) {
+ *dout_ptr = *din_ptr / diny_data;
+ dout_ptr++;
+ din_ptr++;
+ }
+ }
+ }
+}
+
template <> void elementwise_div_broadcast(const float* dinx, const float* diny,
@@ -1541,6 +1644,87 @@ void elementwise_div_relu_broadcast(const float* dinx, } }
+template <typename T>
+void elementwise_mod_broadcast(
+ const T* dinx, const T* diny, T* dout, int batch, int channels, int num) {
+#pragma omp parallel for collapse(2)
+ for (int i = 0; i < batch; ++i) {
+ for (int j = 0; j < channels; ++j) {
+ int offset = (i * channels + j) * num;
+ const T* din_ptr = dinx + offset;
+ const T diny_data = diny[j];
+ T* dout_ptr = dout + offset;
+
+ int cnt = num >> 2;
+ int remain = num % 4;
+ for (int k = 0; k < cnt; ++k) {
+ register T dinx0 = din_ptr[0];
+ register T dinx1 = din_ptr[1];
+ register T dinx2 = din_ptr[2];
+ register T dinx3 = din_ptr[3];
+ dout_ptr[0] = dinx0 % diny_data;
+ dout_ptr[1] = dinx1 % diny_data;
+ dout_ptr[2] = dinx2 % diny_data;
+ dout_ptr[3] = dinx3 % diny_data;
+ din_ptr += 4;
+ dout_ptr += 4;
+ }
+ if (remain > 0) {
+ for (int p = 0; p < remain; p++) {
+ *dout_ptr++ = *din_ptr++ % diny_data;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) {
+ int cnt = num >> 2;
+ int remain = num % 4;
+#pragma omp parallel for
+ for (int i = 0; i < cnt; i++) {
+ const T* dinx_ptr = dinx + (i << 2);
+ const T* diny_ptr = diny + (i << 2);
+ T* dout_ptr = dout + (i << 2);
+
+ register T dinx0 = dinx_ptr[0];
+ register T dinx1 = dinx_ptr[1];
+ register T dinx2 = dinx_ptr[2];
+ register T dinx3 = dinx_ptr[3];
+
+ register T diny0 = diny_ptr[0];
+ register T diny1 = diny_ptr[1];
+ register T diny2 = diny_ptr[2];
+ register T diny3 = diny_ptr[3];
+
+ dout_ptr[0] = dinx0 % diny0;
+ dout_ptr[1] = dinx1 % diny1;
+ dout_ptr[2] = dinx2 % diny2;
+ dout_ptr[3] = dinx3 % diny3;
+ }
+ if (remain > 0) {
+ const T* dinx_ptr = dinx + (cnt << 2);
+ const T* diny_ptr = diny + (cnt << 2);
+ T* dout_ptr = dout + (cnt << 2);
+ for (int i = 0; i < remain; i++) {
+ *dout_ptr++ = *dinx_ptr++ % *diny_ptr++;
+ }
+ }
+}
+
+template void elementwise_mod(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int num);
+
+template void elementwise_mod_broadcast(const int64_t* dinx,
+ const int64_t* diny,
+ int64_t* dout,
+ int batch,
+ int channels,
+ int num);
+
} // namespace math } // namespace arm } // namespace lite
diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h
index 06ecab08edcaf06614de94b99084be2ee80647aa..0b400fcce26c7d307777cc6e25d8d25e0d6234bc 100644
--- a/lite/backends/arm/math/elementwise.h
+++ b/lite/backends/arm/math/elementwise.h
@@ -253,6 +253,13 @@ template <typename T> void elementwise_div_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+template <typename T>
+void elementwise_mod(const T* dinx, const T* diny, T* dout, int num);
+
+template <typename T>
+void elementwise_mod_broadcast(
+ const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+
} // namespace math } // namespace arm } // namespace lite
diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h
index e975160c97b6e7396ab208805a4d685586ac00c8..75dcc971b80e53c3874ffcbb108afdc0e0faa705 100644
--- a/lite/backends/arm/math/funcs.h
+++ b/lite/backends/arm/math/funcs.h
@@ -25,6 +25,7 @@ #include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/box_coder.h"
+#include "lite/backends/arm/math/clip.h"
#include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/conv_block_utils.h"
diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc
index 343e93439d2db563e5ccd4d8c6aed681601871a0..f0c7c65c9067dabb46ad43b3a20a1b85d86d62d0 100644
--- a/lite/backends/arm/math/gemm_prepacked_int8.cc
+++ b/lite/backends/arm/math/gemm_prepacked_int8.cc
@@ -2242,19 +2242,45 @@ void gemm_prepack_oth_int8(const int8_t* A_packed, Dtype* tmp1 = nullptr; Dtype* tmp2 = nullptr; Dtype* tmp3 = nullptr;
- float32_t scale_local[4];
+ float32_t scale_local[4] = {0, 0, 0, 0};
float32_t bias_local[4] = {0, 0, 0, 0}; if (is_bias) {
- bias_local[0] = bias[y];
- bias_local[1] = bias[y + 1];
- bias_local[2] = bias[y + 2];
- bias_local[3] = bias[y + 3];
+ if (y + 4 <= M) {
+ bias_local[0] = bias[y];
+ bias_local[1] = bias[y + 1];
+ bias_local[2] = bias[y + 2];
+ bias_local[3] = bias[y + 3];
+ } else {
+ switch (M - y) {
+ case 3:
+ bias_local[2] = bias[y + 2];
+ case 2:
+ bias_local[1] = bias[y + 1];
+ case 1:
+ bias_local[0] = bias[y + 0];
+ default:
+ break;
+ }
+ }
} if (scale) {
- scale_local[0] = scale[y];
- scale_local[1] = scale[y + 1];
-
scale_local[2] = scale[y + 2]; - scale_local[3] = scale[y + 3]; + if (y + 4 <= M) { + scale_local[0] = scale[y]; + scale_local[1] = scale[y + 1]; + scale_local[2] = scale[y + 2]; + scale_local[3] = scale[y + 3]; + } else { + switch (M - y) { + case 3: + scale_local[2] = scale[y + 2]; + case 2: + scale_local[1] = scale[y + 1]; + case 1: + scale_local[0] = scale[y + 0]; + default: + break; + } + } } if (y + MBLOCK_INT8_OTH > M) { switch (y + MBLOCK_INT8_OTH - M) { diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc index af4934e85756f03ec197520b2b5c130e27bdcad6..db1189a63c38bdb6ab33c6fa280a6f618b53ef7f 100644 --- a/lite/backends/arm/math/packed_sgemm_c4.cc +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -1679,6 +1679,912 @@ void sgemm_prepack_c4_small(int M, } } +void sgemm_prepack_c8_int16_small(int M, + int N, + int K, + const int16_t* A_packed, + const int16_t* B, + int32_t* C, + ARMContext* ctx) { + const int m_round = (M + 7) / 8 * 8; + const int k_round = (K + 7) / 8 * 8; + const int mloop = m_round >> 3; + const int lda = 8 * k_round; + const int ldb_byte = 8 * N * sizeof(int16_t); + const int kcnt = k_round >> 3; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + for (int m = 0; m < mloop; ++m) { + const int16_t* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smull v20.4s, v0.4h, v4.h[0] \n" + "smull v21.4s, v0.4h, v5.h[0] \n" + "smull v22.4s, v0.4h, v6.h[0] \n" + "smull v23.4s, v0.4h, v7.h[0] \n" + "ld1 {v8.8h, v9.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v10.8h, v11.8h}, [%[b]], #32 \n" //load b2, b3 + + "smull2 v24.4s, v0.8h, v4.h[0] \n" + "smull2 v25.4s, v0.8h, v5.h[0] \n" + "smull2 v26.4s, v0.8h, v6.h[0] \n" + "smull2 v27.4s, v0.8h, v7.h[0] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[1] \n" + "smlal v21.4s, v1.4h, v5.h[1] \n" + "smlal v22.4s, v1.4h, v6.h[1] \n" + "smlal v23.4s, v1.4h, v7.h[1] \n" + + "smlal2 v24.4s, v1.8h, v4.h[1] \n" + "smlal2 v25.4s, v1.8h, v5.h[1] \n" + "smlal2 v26.4s, v1.8h, v6.h[1] \n" + "smlal2 v27.4s, v1.8h, v7.h[1] \n" + + "smull v12.4s, v0.4h, v8.h[0] \n" + "smull v13.4s, v0.4h, v9.h[0] \n" + "smull v14.4s, v0.4h, v10.h[0] \n" + "smull v15.4s, v0.4h, v11.h[0] \n" + + "smull2 v16.4s, v0.8h, v8.h[0] \n" + "smull2 v17.4s, v0.8h, v9.h[0] \n" + "smull2 v18.4s, v0.8h, v10.h[0] \n" + "smull2 v19.4s, v0.8h, v11.h[0] \n" + + "smlal v12.4s, v1.4h, v8.h[1] \n" + "smlal v13.4s, v1.4h, v9.h[1] \n" + "smlal v14.4s, v1.4h, v10.h[1] \n" + "smlal v15.4s, v1.4h, v11.h[1] \n" + + "smlal2 v16.4s, v1.8h, v8.h[1] \n" + "smlal2 v17.4s, v1.8h, v9.h[1] \n" + "smlal2 v18.4s, v1.8h, v10.h[1] \n" + "smlal2 v19.4s, v1.8h, v11.h[1] \n" + + "smlal v20.4s, v2.4h, v4.h[2] \n" + "smlal v21.4s, v2.4h, v5.h[2] \n" + "smlal v22.4s, v2.4h, v6.h[2] \n" + "smlal v23.4s, v2.4h, v7.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[2] \n" + "smlal2 v25.4s, v2.8h, v5.h[2] \n" + "smlal2 v26.4s, v2.8h, v6.h[2] \n" + "smlal2 v27.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v2.4h, v8.h[2] \n" + "smlal v13.4s, v2.4h, v9.h[2] \n" + "smlal v14.4s, v2.4h, v10.h[2] \n" + "smlal v15.4s, v2.4h, v11.h[2] \n" + "smlal2 v16.4s, v2.8h, v8.h[2] \n" + "smlal2 v17.4s, 
v2.8h, v9.h[2] \n" + "smlal2 v18.4s, v2.8h, v10.h[2] \n" + "smlal2 v19.4s, v2.8h, v11.h[2] \n" + + "smlal v20.4s, v3.4h, v4.h[3] \n" + "smlal v21.4s, v3.4h, v5.h[3] \n" + "smlal v22.4s, v3.4h, v6.h[3] \n" + "smlal v23.4s, v3.4h, v7.h[3] \n" + "smlal2 v24.4s, v3.8h, v4.h[3] \n" + "smlal2 v25.4s, v3.8h, v5.h[3] \n" + "smlal2 v26.4s, v3.8h, v6.h[3] \n" + "smlal2 v27.4s, v3.8h, v7.h[3] \n" + "smlal v12.4s, v3.4h, v8.h[3] \n" + "smlal v13.4s, v3.4h, v9.h[3] \n" + "smlal v14.4s, v3.4h, v10.h[3] \n" + "smlal v15.4s, v3.4h, v11.h[3] \n" + "smlal2 v16.4s, v3.8h, v8.h[3] \n" + "smlal2 v17.4s, v3.8h, v9.h[3] \n" + "smlal2 v18.4s, v3.8h, v10.h[3] \n" + "smlal2 v19.4s, v3.8h, v11.h[3] \n" + + "smlal v20.4s, v0.4h, v4.h[4] \n" + "smlal v21.4s, v0.4h, v5.h[4] \n" + "smlal v22.4s, v0.4h, v6.h[4] \n" + "smlal v23.4s, v0.4h, v7.h[4] \n" + + "smlal2 v24.4s, v0.8h, v4.h[4] \n" + "smlal2 v25.4s, v0.8h, v5.h[4] \n" + "smlal2 v26.4s, v0.8h, v6.h[4] \n" + "smlal2 v27.4s, v0.8h, v7.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[5] \n" + "smlal v21.4s, v1.4h, v5.h[5] \n" + "smlal v22.4s, v1.4h, v6.h[5] \n" + "smlal v23.4s, v1.4h, v7.h[5] \n" + + "smlal2 v24.4s, v1.8h, v4.h[5] \n" + "smlal2 v25.4s, v1.8h, v5.h[5] \n" + "smlal2 v26.4s, v1.8h, v6.h[5] \n" + "smlal2 v27.4s, v1.8h, v7.h[5] \n" + + "smlal v12.4s, v0.4h, v8.h[4] \n" + "smlal v13.4s, v0.4h, v9.h[4] \n" + "smlal v14.4s, v0.4h, v10.h[4] \n" + "smlal v15.4s, v0.4h, v11.h[4] \n" + + "smlal2 v16.4s, v0.8h, v8.h[4] \n" + "smlal2 v17.4s, v0.8h, v9.h[4] \n" + "smlal2 v18.4s, v0.8h, v10.h[4] \n" + "smlal2 v19.4s, v0.8h, v11.h[4] \n" + + "smlal v12.4s, v1.4h, v8.h[5] \n" + "smlal v13.4s, v1.4h, v9.h[5] \n" + "smlal v14.4s, v1.4h, v10.h[5] \n" + "smlal v15.4s, v1.4h, v11.h[5] \n" + + "smlal2 v16.4s, v1.8h, v8.h[5] \n" + "smlal2 v17.4s, v1.8h, v9.h[5] \n" + "smlal2 v18.4s, v1.8h, v10.h[5] \n" + "smlal2 v19.4s, v1.8h, v11.h[5] \n" + + "smlal v20.4s, v2.4h, v4.h[6] \n" + "smlal v21.4s, v2.4h, v5.h[6] \n" + "smlal v22.4s, v2.4h, v6.h[6] \n" + "smlal v23.4s, v2.4h, v7.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[6] \n" + "smlal2 v25.4s, v2.8h, v5.h[6] \n" + "smlal2 v26.4s, v2.8h, v6.h[6] \n" + "smlal2 v27.4s, v2.8h, v7.h[6] \n" + "sub %[b], %[b], #128 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v20.4s, v3.4h, v4.h[7] \n" + "smlal v21.4s, v3.4h, v5.h[7] \n" + "smlal v22.4s, v3.4h, v6.h[7] \n" + "smlal v23.4s, v3.4h, v7.h[7] \n" + "smlal2 v24.4s, v3.8h, v4.h[7] \n" + "smlal2 v25.4s, v3.8h, v5.h[7] \n" + "smlal2 v26.4s, v3.8h, v6.h[7] \n" + "smlal2 v27.4s, v3.8h, v7.h[7] \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal v12.4s, v2.4h, v8.h[6] \n" + "smlal v13.4s, v2.4h, v9.h[6] \n" + "smlal v14.4s, v2.4h, v10.h[6] \n" + "smlal v15.4s, v2.4h, v11.h[6] \n" + "smlal2 v16.4s, v2.8h, v8.h[6] \n" + "smlal2 v17.4s, v2.8h, v9.h[6] \n" + "smlal2 v18.4s, v2.8h, v10.h[6] \n" + "smlal2 v19.4s, v2.8h, v11.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + + "smlal v12.4s, v3.4h, v8.h[7] \n" + "smlal v13.4s, v3.4h, v9.h[7] \n" + "smlal v14.4s, v3.4h, v10.h[7] \n" + "smlal v15.4s, v3.4h, v11.h[7] \n" + "smlal2 v16.4s, v3.8h, v8.h[7] \n" + "smlal2 v17.4s, v3.8h, v9.h[7] \n" + "smlal2 v18.4s, v3.8h, v10.h[7] \n" + "smlal2 v19.4s, v3.8h, v11.h[7] \n" + + "beq 2f \n" + "1:\n" + "smlal v20.4s, v0.4h, v4.h[0] \n" + "smlal v21.4s, v0.4h, v5.h[0] \n" + "smlal v22.4s, v0.4h, v6.h[0] \n" + "smlal v23.4s, v0.4h, v7.h[0] \n" + "ld1 {v8.8h, v9.8h}, 
[%[b]], #32 \n" //load b0, b1 + "ld1 {v10.8h, v11.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal2 v24.4s, v0.8h, v4.h[0] \n" + "smlal2 v25.4s, v0.8h, v5.h[0] \n" + "smlal2 v26.4s, v0.8h, v6.h[0] \n" + "smlal2 v27.4s, v0.8h, v7.h[0] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[1] \n" + "smlal v21.4s, v1.4h, v5.h[1] \n" + "smlal v22.4s, v1.4h, v6.h[1] \n" + "smlal v23.4s, v1.4h, v7.h[1] \n" + + "smlal2 v24.4s, v1.8h, v4.h[1] \n" + "smlal2 v25.4s, v1.8h, v5.h[1] \n" + "smlal2 v26.4s, v1.8h, v6.h[1] \n" + "smlal2 v27.4s, v1.8h, v7.h[1] \n" + + "smlal v12.4s, v0.4h, v8.h[0] \n" + "smlal v13.4s, v0.4h, v9.h[0] \n" + "smlal v14.4s, v0.4h, v10.h[0] \n" + "smlal v15.4s, v0.4h, v11.h[0] \n" + + "smlal2 v16.4s, v0.8h, v8.h[0] \n" + "smlal2 v17.4s, v0.8h, v9.h[0] \n" + "smlal2 v18.4s, v0.8h, v10.h[0] \n" + "smlal2 v19.4s, v0.8h, v11.h[0] \n" + + "smlal v12.4s, v1.4h, v8.h[1] \n" + "smlal v13.4s, v1.4h, v9.h[1] \n" + "smlal v14.4s, v1.4h, v10.h[1] \n" + "smlal v15.4s, v1.4h, v11.h[1] \n" + + "smlal2 v16.4s, v1.8h, v8.h[1] \n" + "smlal2 v17.4s, v1.8h, v9.h[1] \n" + "smlal2 v18.4s, v1.8h, v10.h[1] \n" + "smlal2 v19.4s, v1.8h, v11.h[1] \n" + + "smlal v20.4s, v2.4h, v4.h[2] \n" + "smlal v21.4s, v2.4h, v5.h[2] \n" + "smlal v22.4s, v2.4h, v6.h[2] \n" + "smlal v23.4s, v2.4h, v7.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[2] \n" + "smlal2 v25.4s, v2.8h, v5.h[2] \n" + "smlal2 v26.4s, v2.8h, v6.h[2] \n" + "smlal2 v27.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v2.4h, v8.h[2] \n" + "smlal v13.4s, v2.4h, v9.h[2] \n" + "smlal v14.4s, v2.4h, v10.h[2] \n" + "smlal v15.4s, v2.4h, v11.h[2] \n" + "smlal2 v16.4s, v2.8h, v8.h[2] \n" + "smlal2 v17.4s, v2.8h, v9.h[2] \n" + "smlal2 v18.4s, v2.8h, v10.h[2] \n" + "smlal2 v19.4s, v2.8h, v11.h[2] \n" + + "smlal v20.4s, v3.4h, v4.h[3] \n" + "smlal v21.4s, v3.4h, v5.h[3] \n" + "smlal v22.4s, v3.4h, v6.h[3] \n" + "smlal v23.4s, v3.4h, v7.h[3] \n" + "smlal2 v24.4s, v3.8h, v4.h[3] \n" + "smlal2 v25.4s, v3.8h, v5.h[3] \n" + "smlal2 v26.4s, v3.8h, v6.h[3] \n" + "smlal2 v27.4s, v3.8h, v7.h[3] \n" + "smlal v12.4s, v3.4h, v8.h[3] \n" + "smlal v13.4s, v3.4h, v9.h[3] \n" + "smlal v14.4s, v3.4h, v10.h[3] \n" + "smlal v15.4s, v3.4h, v11.h[3] \n" + "smlal2 v16.4s, v3.8h, v8.h[3] \n" + "smlal2 v17.4s, v3.8h, v9.h[3] \n" + "smlal2 v18.4s, v3.8h, v10.h[3] \n" + "smlal2 v19.4s, v3.8h, v11.h[3] \n" + + "smlal v20.4s, v0.4h, v4.h[4] \n" + "smlal v21.4s, v0.4h, v5.h[4] \n" + "smlal v22.4s, v0.4h, v6.h[4] \n" + "smlal v23.4s, v0.4h, v7.h[4] \n" + + "smlal2 v24.4s, v0.8h, v4.h[4] \n" + "smlal2 v25.4s, v0.8h, v5.h[4] \n" + "smlal2 v26.4s, v0.8h, v6.h[4] \n" + "smlal2 v27.4s, v0.8h, v7.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" //load a2, a3 + + "smlal v20.4s, v1.4h, v4.h[5] \n" + "smlal v21.4s, v1.4h, v5.h[5] \n" + "smlal v22.4s, v1.4h, v6.h[5] \n" + "smlal v23.4s, v1.4h, v7.h[5] \n" + + "smlal2 v24.4s, v1.8h, v4.h[5] \n" + "smlal2 v25.4s, v1.8h, v5.h[5] \n" + "smlal2 v26.4s, v1.8h, v6.h[5] \n" + "smlal2 v27.4s, v1.8h, v7.h[5] \n" + + "smlal v12.4s, v0.4h, v8.h[4] \n" + "smlal v13.4s, v0.4h, v9.h[4] \n" + "smlal v14.4s, v0.4h, v10.h[4] \n" + "smlal v15.4s, v0.4h, v11.h[4] \n" + + "smlal2 v16.4s, v0.8h, v8.h[4] \n" + "smlal2 v17.4s, v0.8h, v9.h[4] \n" + "smlal2 v18.4s, v0.8h, v10.h[4] \n" + "smlal2 v19.4s, v0.8h, v11.h[4] \n" + + "smlal v12.4s, v1.4h, v8.h[5] \n" + "smlal v13.4s, v1.4h, v9.h[5] \n" + "smlal v14.4s, v1.4h, v10.h[5] \n" + "smlal v15.4s, v1.4h, v11.h[5] \n" + + "smlal2 v16.4s, v1.8h, v8.h[5] \n" + 
"smlal2 v17.4s, v1.8h, v9.h[5] \n" + "smlal2 v18.4s, v1.8h, v10.h[5] \n" + "smlal2 v19.4s, v1.8h, v11.h[5] \n" + + "smlal v20.4s, v2.4h, v4.h[6] \n" + "smlal v21.4s, v2.4h, v5.h[6] \n" + "smlal v22.4s, v2.4h, v6.h[6] \n" + "smlal v23.4s, v2.4h, v7.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" //load a0, a1 + "smlal2 v24.4s, v2.8h, v4.h[6] \n" + "smlal2 v25.4s, v2.8h, v5.h[6] \n" + "smlal2 v26.4s, v2.8h, v6.h[6] \n" + "smlal2 v27.4s, v2.8h, v7.h[6] \n" + "sub %[b], %[b], #128 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v20.4s, v3.4h, v4.h[7] \n" + "smlal v21.4s, v3.4h, v5.h[7] \n" + "smlal v22.4s, v3.4h, v6.h[7] \n" + "smlal v23.4s, v3.4h, v7.h[7] \n" + "smlal2 v24.4s, v3.8h, v4.h[7] \n" + "smlal2 v25.4s, v3.8h, v5.h[7] \n" + "smlal2 v26.4s, v3.8h, v6.h[7] \n" + "smlal2 v27.4s, v3.8h, v7.h[7] \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" //load b0, b1 + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" //load b2, b3 + + "smlal v12.4s, v2.4h, v8.h[6] \n" + "smlal v13.4s, v2.4h, v9.h[6] \n" + "smlal v14.4s, v2.4h, v10.h[6] \n" + "smlal v15.4s, v2.4h, v11.h[6] \n" + "smlal2 v16.4s, v2.8h, v8.h[6] \n" + "smlal2 v17.4s, v2.8h, v9.h[6] \n" + "smlal2 v18.4s, v2.8h, v10.h[6] \n" + "smlal2 v19.4s, v2.8h, v11.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + + "smlal v12.4s, v3.4h, v8.h[7] \n" + "smlal v13.4s, v3.4h, v9.h[7] \n" + "smlal v14.4s, v3.4h, v10.h[7] \n" + "smlal v15.4s, v3.4h, v11.h[7] \n" + "smlal2 v16.4s, v3.8h, v8.h[7] \n" + "smlal2 v17.4s, v3.8h, v9.h[7] \n" + "smlal2 v18.4s, v3.8h, v10.h[7] \n" + "smlal2 v19.4s, v3.8h, v11.h[7] \n" + + "bne 1b \n" + "2: \n" + "stp q20, q24, [%[c]], #32 \n" + "stp q21, q25, [%[c]], #32 \n" + "stp q22, q26, [%[c]], #32 \n" + "stp q23, q27, [%[c]], #32 \n" + "stp q12, q16, [%[c]], #32 \n" + "stp q13, q17, [%[c]], #32 \n" + "stp q14, q18, [%[c]], #32 \n" + "stp q15, q19, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "v7", "v8", "v9", + "v10", "v11", "13", "14", "15", "16", "17", "18", "19","v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "cc", "memory" + ); + // clang format on + b += 64; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + + "smull v8.4s, v0.4h, v4.h[0] \n" + "smull v9.4s, v0.4h, v5.h[0] \n" + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" + "smull2 v10.4s, v0.8h, v4.h[0] \n" + "smull2 v11.4s, v0.8h, v5.h[0] \n" + + "smlal v8.4s, v1.4h, v4.h[1] \n" + "smlal v9.4s, v1.4h, v5.h[1] \n" + "smlal2 v10.4s, v1.8h, v4.h[1] \n" + "smlal2 v11.4s, v1.8h, v5.h[1] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smull v12.4s, v0.4h, v6.h[0] \n" + "smull v13.4s, v0.4h, v7.h[0] \n" + "smull2 v14.4s, v0.8h, v6.h[0] \n" + "smull2 v15.4s, v0.8h, v7.h[0] \n" + "smlal v12.4s, v1.4h, v6.h[1] \n" + "smlal v13.4s, v1.4h, v7.h[1] \n" + "smlal2 v14.4s, v1.8h, v6.h[1] \n" + "smlal2 v15.4s, v1.8h, v7.h[1] \n" + + "smlal v8.4s, v2.4h, v4.h[2] \n" + "smlal v9.4s, v2.4h, v5.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[2] \n" + "smlal2 v11.4s, v2.8h, v5.h[2] \n" + "smlal v8.4s, v3.4h, v4.h[3] \n" + "smlal v9.4s, v3.4h, v5.h[3] \n" + "smlal2 v10.4s, v3.8h, v4.h[3] \n" + "smlal2 v11.4s, v3.8h, v5.h[3] \n" + + "smlal v12.4s, v2.4h, v6.h[2] \n" + "smlal v13.4s, v2.4h, v7.h[2] \n" + "smlal2 v14.4s, v2.8h, v6.h[2] \n" + "smlal2 v15.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, 
v3.4h, v6.h[3] \n" + "smlal v13.4s, v3.4h, v7.h[3] \n" + "smlal2 v14.4s, v3.8h, v6.h[3] \n" + "smlal2 v15.4s, v3.8h, v7.h[3] \n" + + "smlal v8.4s, v0.4h, v4.h[4] \n" + "smlal v9.4s, v0.4h, v5.h[4] \n" + "smlal2 v10.4s, v0.8h, v4.h[4] \n" + "smlal2 v11.4s, v0.8h, v5.h[4] \n" + + "smlal v8.4s, v1.4h, v4.h[5] \n" + "smlal v9.4s, v1.4h, v5.h[5] \n" + "smlal2 v10.4s, v1.8h, v4.h[5] \n" + "smlal2 v11.4s, v1.8h, v5.h[5] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smlal v12.4s, v0.4h, v6.h[4] \n" + "smlal v13.4s, v0.4h, v7.h[4] \n" + "smlal2 v14.4s, v0.8h, v6.h[4] \n" + "smlal2 v15.4s, v0.8h, v7.h[4] \n" + "smlal v12.4s, v1.4h, v6.h[5] \n" + "smlal v13.4s, v1.4h, v7.h[5] \n" + "smlal2 v14.4s, v1.8h, v6.h[5] \n" + "smlal2 v15.4s, v1.8h, v7.h[5] \n" + + "smlal v8.4s, v2.4h, v4.h[6] \n" + "smlal v9.4s, v2.4h, v5.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[6] \n" + "smlal2 v11.4s, v2.8h, v5.h[6] \n" + "smlal v8.4s, v3.4h, v4.h[7] \n" + "smlal v9.4s, v3.4h, v5.h[7] \n" + "smlal2 v10.4s, v3.8h, v4.h[7] \n" + "smlal2 v11.4s, v3.8h, v5.h[7] \n" + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + + "smlal v12.4s, v2.4h, v6.h[6] \n" + "smlal v13.4s, v2.4h, v7.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + "smlal2 v14.4s, v2.8h, v6.h[6] \n" + "smlal2 v15.4s, v2.8h, v7.h[6] \n" + "smlal v12.4s, v3.4h, v6.h[7] \n" + "smlal v13.4s, v3.4h, v7.h[7] \n" + "smlal2 v14.4s, v3.8h, v6.h[7] \n" + "smlal2 v15.4s, v3.8h, v7.h[7] \n" + + "beq 2f \n" + "1: \n" + "smlal v8.4s, v0.4h, v4.h[0] \n" + "smlal v9.4s, v0.4h, v5.h[0] \n" + "ld1 {v6.8h, v7.8h}, [%[b]], #32 \n" + "smlal2 v10.4s, v0.8h, v4.h[0] \n" + "smlal2 v11.4s, v0.8h, v5.h[0] \n" + + "smlal v8.4s, v1.4h, v4.h[1] \n" + "smlal v9.4s, v1.4h, v5.h[1] \n" + "smlal2 v10.4s, v1.8h, v4.h[1] \n" + "smlal2 v11.4s, v1.8h, v5.h[1] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + + "smlal v12.4s, v0.4h, v6.h[0] \n" + "smlal v13.4s, v0.4h, v7.h[0] \n" + "smlal2 v14.4s, v0.8h, v6.h[0] \n" + "smlal2 v15.4s, v0.8h, v7.h[0] \n" + "smlal v12.4s, v1.4h, v6.h[1] \n" + "smlal v13.4s, v1.4h, v7.h[1] \n" + "smlal2 v14.4s, v1.8h, v6.h[1] \n" + "smlal2 v15.4s, v1.8h, v7.h[1] \n" + + "smlal v8.4s, v2.4h, v4.h[2] \n" + "smlal v9.4s, v2.4h, v5.h[2] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v2.8h, v4.h[2] \n" + "smlal2 v11.4s, v2.8h, v5.h[2] \n" + "smlal v8.4s, v3.4h, v4.h[3] \n" + "smlal v9.4s, v3.4h, v5.h[3] \n" + "smlal2 v10.4s, v3.8h, v4.h[3] \n" + "smlal2 v11.4s, v3.8h, v5.h[3] \n" + + "smlal v12.4s, v2.4h, v6.h[2] \n" + "smlal v13.4s, v2.4h, v7.h[2] \n" + "smlal2 v14.4s, v2.8h, v6.h[2] \n" + "smlal2 v15.4s, v2.8h, v7.h[2] \n" + "smlal v12.4s, v3.4h, v6.h[3] \n" + "smlal v13.4s, v3.4h, v7.h[3] \n" + "smlal2 v14.4s, v3.8h, v6.h[3] \n" + "smlal2 v15.4s, v3.8h, v7.h[3] \n" + + "smlal v8.4s, v0.4h, v4.h[4] \n" + "smlal v9.4s, v0.4h, v5.h[4] \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smlal2 v10.4s, v0.8h, v4.h[4] \n" + "smlal2 v11.4s, v0.8h, v5.h[4] \n" + + "smlal v8.4s, v1.4h, v4.h[5] \n" + "smlal v9.4s, v1.4h, v5.h[5] \n" + "smlal2 v10.4s, v1.8h, v4.h[5] \n" + "smlal2 v11.4s, v1.8h, v5.h[5] \n" + + "smlal v12.4s, v0.4h, v6.h[4] \n" + "smlal v13.4s, v0.4h, v7.h[4] \n" + "smlal2 v14.4s, v0.8h, v6.h[4] \n" + "smlal2 v15.4s, v0.8h, v7.h[4] \n" + "smlal v12.4s, v1.4h, v6.h[5] \n" + "smlal v13.4s, v1.4h, v7.h[5] \n" + "smlal2 v14.4s, v1.8h, v6.h[5] \n" + "smlal2 v15.4s, v1.8h, v7.h[5] \n" + + "smlal v8.4s, v2.4h, v4.h[6] \n" + "smlal v9.4s, v2.4h, v5.h[6] \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + 
"smlal2 v10.4s, v2.8h, v4.h[6] \n" + "smlal2 v11.4s, v2.8h, v5.h[6] \n" + "smlal v8.4s, v3.4h, v4.h[7] \n" + "smlal v9.4s, v3.4h, v5.h[7] \n" + "smlal2 v10.4s, v3.8h, v4.h[7] \n" + "smlal2 v11.4s, v3.8h, v5.h[7] \n" + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + + "smlal v12.4s, v2.4h, v6.h[6] \n" + "smlal v13.4s, v2.4h, v7.h[6] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v4.8h, v5.8h}, [%[b]], #32 \n" + "smlal2 v14.4s, v2.8h, v6.h[6] \n" + "smlal2 v15.4s, v2.8h, v7.h[6] \n" + "smlal v12.4s, v3.4h, v6.h[7] \n" + "smlal v13.4s, v3.4h, v7.h[7] \n" + "smlal2 v14.4s, v3.8h, v6.h[7] \n" + "smlal2 v15.4s, v3.8h, v7.h[7] \n" + + "bne 1b \n" + "2: \n" + "stp q8, q10, [%[c]], #32 \n" + "stp q9, q11, [%[c]], #32 \n" + "stp q12, q14, [%[c]], #32 \n" + "stp q13, q15, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "v7", "v8", "v9", + "v10", "v11","v12", "v13", "v14", "v15", "cc", "memory" + ); + // clang-format on + b += 32; + } + for (; n > 0; --n) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile( + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "ld1 {v4.8h}, [%[b]], #16 \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smull v5.4s, v0.4h, v4.h[0] \n" + "smull2 v6.4s, v0.8h, v4.h[0] \n" + "ld1 {v10.8h, v11.8h}, [%[a]], #32 \n" + "smlal v5.4s, v1.4h, v4.h[1] \n" + "smlal2 v6.4s, v1.8h, v4.h[1] \n" + "ld1 {v12.8h, v13.8h}, [%[a]], #32 \n" + "smlal v5.4s, v2.4h, v4.h[2] \n" + "smlal2 v6.4s, v2.8h, v4.h[2] \n" + "smlal v5.4s, v3.4h, v4.h[3] \n" + "smlal2 v6.4s, v3.8h, v4.h[3] \n" + "sub %[b], %[b], #16 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v5.4s, v10.4h, v4.h[4] \n" + "smlal2 v6.4s, v10.8h, v4.h[4] \n" + "smlal v5.4s, v11.4h, v4.h[5] \n" + "smlal2 v6.4s, v11.8h, v4.h[5] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal v5.4s, v12.4h, v4.h[6] \n" + "smlal2 v6.4s, v12.8h, v4.h[6] \n" + "smlal v5.4s, v13.4h, v4.h[7] \n" + "smlal2 v6.4s, v13.8h, v4.h[7] \n" + + "beq 2f \n" + "1: \n" + "ld1 {v4.8h}, [%[b]], #16 \n" + "ld1 {v2.8h, v3.8h}, [%[a]], #32 \n" + "smlal v5.4s, v0.4h, v4.h[0] \n" + "smlal2 v6.4s, v0.8h, v4.h[0] \n" + "ld1 {v10.8h, v11.8h}, [%[a]], #32 \n" + "smlal v5.4s, v1.4h, v4.h[1] \n" + "smlal2 v6.4s, v1.8h, v4.h[1] \n" + "ld1 {v12.8h, v13.8h}, [%[a]], #32 \n" + "smlal v5.4s, v2.4h, v4.h[2] \n" + "smlal2 v6.4s, v2.8h, v4.h[2] \n" + "smlal v5.4s, v3.4h, v4.h[3] \n" + "smlal2 v6.4s, v3.8h, v4.h[3] \n" + "sub %[b], %[b], #16 \n" + "add %[b], %[b], %[ldb] \n" + "smlal v5.4s, v10.4h, v4.h[4] \n" + "smlal2 v6.4s, v10.8h, v4.h[4] \n" + "smlal v5.4s, v11.4h, v4.h[5] \n" + "smlal2 v6.4s, v11.8h, v4.h[5] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v0.8h, v1.8h}, [%[a]], #32 \n" + "smlal v5.4s, v12.4h, v4.h[6] \n" + "smlal2 v6.4s, v12.8h, v4.h[6] \n" + "smlal v5.4s, v13.4h, v4.h[7] \n" + "smlal2 v6.4s, v13.8h, v4.h[7] \n" + "bne 1b \n" + + "2: \n" + "st1 {v5.4s, v6.4s}, [%[c]], #32 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "v0", "v1", "v2", "v3", "v4","v5", "v6", "cc", "memory" + ); + // clang-format on + b += 8; + } +#else + for (; n > 3; n -= 4) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang-format off + asm volatile ( + "vld1.16 {d0-d3}, [%[b]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vld1.16 {d4-d7}, [%[b]]! 
\n" + "vmull.s16 q8, d8, d0[0] \n" + "vmull.s16 q9, d8, d2[0] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmull.s16 q10, d9, d0[0] \n" + "vmull.s16 q11, d9, d2[0] \n" + "vmlal.s16 q8, d10, d0[1] \n" + "vmlal.s16 q9, d10, d2[1] \n" + "vmlal.s16 q10, d11, d0[1] \n" + "vmlal.s16 q11, d11, d2[1] \n" + "vmull.s16 q12, d8, d4[0] \n" + "vmull.s16 q13, d8, d6[0] \n" + "vmull.s16 q14, d9, d4[0] \n" + "vmull.s16 q15, d9, d6[0] \n" + "vmlal.s16 q12, d10, d4[1] \n" + "vmlal.s16 q13, d10, d6[1] \n" + "vmlal.s16 q14, d11, d4[1] \n" + "vmlal.s16 q15, d11, d6[1] \n" + + "vmlal.s16 q8, d12, d0[2] \n" + "vmlal.s16 q9, d12, d2[2] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q10, d13, d0[2] \n" + "vmlal.s16 q11, d13, d2[2] \n" + "vmlal.s16 q8, d14, d0[3] \n" + "vmlal.s16 q9, d14, d2[3] \n" + "vmlal.s16 q10, d15, d0[3] \n" + "vmlal.s16 q11, d15, d2[3] \n" + + "vmlal.s16 q12, d12, d4[2] \n" + "vmlal.s16 q13, d12, d6[2] \n" + "vmlal.s16 q14, d13, d4[2] \n" + "vmlal.s16 q15, d13, d6[2] \n" + "vmlal.s16 q12, d14, d4[3] \n" + "vmlal.s16 q13, d14, d6[3] \n" + "vmlal.s16 q14, d15, d4[3] \n" + "vmlal.s16 q15, d15, d6[3] \n" + + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[0] \n" + "vmlal.s16 q9, d8, d3[0] \n" + "vmlal.s16 q10, d9, d1[0] \n" + "vmlal.s16 q11, d9, d3[0] \n" + "vmlal.s16 q8, d10, d1[1] \n" + "vmlal.s16 q9, d10, d3[1] \n" + "vmlal.s16 q10, d11, d1[1] \n" + "vmlal.s16 q11, d11, d3[1] \n" + "vmlal.s16 q8, d12, d1[2] \n" + "vmlal.s16 q9, d12, d3[2] \n" + "vmlal.s16 q10, d13, d1[2] \n" + "vmlal.s16 q11, d13, d3[2] \n" + "vmlal.s16 q8, d14, d1[3] \n" + "vmlal.s16 q9, d14, d3[3] \n" + "vmlal.s16 q10, d15, d1[3] \n" + "vmlal.s16 q11, d15, d3[3] \n" + "vld1.16 {d0-d3}, [%[b]]! \n" + "vmlal.s16 q12, d8, d5[0] \n" + "vmlal.s16 q13, d8, d7[0] \n" + "vmlal.s16 q14, d9, d5[0] \n" + "vmlal.s16 q15, d9, d7[0] \n" + "vmlal.s16 q12, d10, d5[1] \n" + "vmlal.s16 q13, d10, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmlal.s16 q14, d11, d5[1] \n" + "vmlal.s16 q15, d11, d7[1] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q12, d12, d5[2] \n" + "vmlal.s16 q13, d12, d7[2] \n" + "vmlal.s16 q14, d13, d5[2] \n" + "vmlal.s16 q15, d13, d7[2] \n" + "vmlal.s16 q12, d14, d5[3] \n" + "vmlal.s16 q13, d14, d7[3] \n" + "vmlal.s16 q14, d15, d5[3] \n" + "vmlal.s16 q15, d15, d7[3] \n" + + "beq 2f \n" + "1: \n" + "vld1.16 {d4-d7}, [%[b]]! \n" + "vmlal.s16 q8, d8, d0[0] \n" + "vmlal.s16 q9, d8, d2[0] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q10, d9, d0[0] \n" + "vmlal.s16 q11, d9, d2[0] \n" + "vmlal.s16 q8, d10, d0[1] \n" + "vmlal.s16 q9, d10, d2[1] \n" + "vmlal.s16 q10, d11, d0[1] \n" + "vmlal.s16 q11, d11, d2[1] \n" + "vmlal.s16 q12, d8, d4[0] \n" + "vmlal.s16 q13, d8, d6[0] \n" + "vmlal.s16 q14, d9, d4[0] \n" + "vmlal.s16 q15, d9, d6[0] \n" + "vmlal.s16 q12, d10, d4[1] \n" + "vmlal.s16 q13, d10, d6[1] \n" + "vmlal.s16 q14, d11, d4[1] \n" + "vmlal.s16 q15, d11, d6[1] \n" + + "vmlal.s16 q8, d12, d0[2] \n" + "vmlal.s16 q9, d12, d2[2] \n" + "vld1.16 {d8-d11}, [%[a]]! 
\n" + "vmlal.s16 q10, d13, d0[2] \n" + "vmlal.s16 q11, d13, d2[2] \n" + "vmlal.s16 q8, d14, d0[3] \n" + "vmlal.s16 q9, d14, d2[3] \n" + "vmlal.s16 q10, d15, d0[3] \n" + "vmlal.s16 q11, d15, d2[3] \n" + + "vmlal.s16 q12, d12, d4[2] \n" + "vmlal.s16 q13, d12, d6[2] \n" + "vmlal.s16 q14, d13, d4[2] \n" + "vmlal.s16 q15, d13, d6[2] \n" + "vmlal.s16 q12, d14, d4[3] \n" + "vmlal.s16 q13, d14, d6[3] \n" + "vmlal.s16 q14, d15, d4[3] \n" + "vmlal.s16 q15, d15, d6[3] \n" + + "sub %[b], %[b], #64 \n" + "add %[b], %[b], %[ldb] \n" + "vld1.16 {d12-d15}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[0] \n" + "vmlal.s16 q9, d8, d3[0] \n" + "vmlal.s16 q10, d9, d1[0] \n" + "vmlal.s16 q11, d9, d3[0] \n" + "vmlal.s16 q8, d10, d1[1] \n" + "vmlal.s16 q9, d10, d3[1] \n" + "vmlal.s16 q10, d11, d1[1] \n" + "vmlal.s16 q11, d11, d3[1] \n" + "vmlal.s16 q8, d12, d1[2] \n" + "vmlal.s16 q9, d12, d3[2] \n" + "vmlal.s16 q10, d13, d1[2] \n" + "vmlal.s16 q11, d13, d3[2] \n" + "vmlal.s16 q8, d14, d1[3] \n" + "vmlal.s16 q9, d14, d3[3] \n" + "vmlal.s16 q10, d15, d1[3] \n" + "vmlal.s16 q11, d15, d3[3] \n" + "vld1.16 {d0-d3}, [%[b]]! \n" + "vmlal.s16 q12, d8, d5[0] \n" + "vmlal.s16 q13, d8, d7[0] \n" + "vmlal.s16 q14, d9, d5[0] \n" + "vmlal.s16 q15, d9, d7[0] \n" + "vmlal.s16 q12, d10, d5[1] \n" + "vmlal.s16 q13, d10, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmlal.s16 q14, d11, d5[1] \n" + "vmlal.s16 q15, d11, d7[1] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q12, d12, d5[2] \n" + "vmlal.s16 q13, d12, d7[2] \n" + "vmlal.s16 q14, d13, d5[2] \n" + "vmlal.s16 q15, d13, d7[2] \n" + "vmlal.s16 q12, d14, d5[3] \n" + "vmlal.s16 q13, d14, d7[3] \n" + "vmlal.s16 q14, d15, d5[3] \n" + "vmlal.s16 q15, d15, d7[3] \n" + + "bne 1b \n" + "2: \n" + "vst1.32 {d16-d17}, [%[c]]! \n" + "vst1.32 {d20-d21}, [%[c]]! \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! \n" + "vst1.32 {d24-d25}, [%[c]]! \n" + "vst1.32 {d28-d29}, [%[c]]! \n" + "vst1.32 {d26-d27}, [%[c]]! \n" + "vst1.32 {d30-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4","q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory" + ); + // clang format on + b += 32; + } + for (; n > 0; --n) { + int cnt = kcnt; + const int16_t* a_ptr = A_packed; + const int16_t* b_ptr = b; + // clang format off + asm volatile ( + "vld1.16 {d0-d1}, [%[b]]! \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmull.s16 q8, d4, d0[0] \n" + "vmull.s16 q9, d5, d0[0] \n" + "sub %[b], %[b], #16 \n" + "vmlal.s16 q8, d6, d0[1] \n" + "vmlal.s16 q9, d7, d0[1] \n" + "add %[b], %[b], %[ldb] \n" + "subs %[cnt], %[cnt], #1 \n" + + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d0[2] \n" + "vmlal.s16 q9, d9, d0[2] \n" + "vmlal.s16 q8, d10, d0[3] \n" + "vmlal.s16 q9, d11, d0[3] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + + "vmlal.s16 q8, d4, d1[0] \n" + "vmlal.s16 q9, d5, d1[0] \n" + "vmlal.s16 q8, d6, d1[1] \n" + "vmlal.s16 q9, d7, d1[1] \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[2] \n" + "vmlal.s16 q9, d9, d1[2] \n" + "vmlal.s16 q8, d10, d1[3] \n" + "vmlal.s16 q9, d11, d1[3] \n" + "beq 2f \n" + "1:\n" + "vld1.16 {d0-d1}, [%[b]]! \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + "vmlal.s16 q8, d4, d0[0] \n" + "vmlal.s16 q9, d5, d0[0] \n" + "sub %[b], %[b], #16 \n" + "vmlal.s16 q8, d6, d0[1] \n" + "vmlal.s16 q9, d7, d0[1] \n" + "add %[b], %[b], %[ldb] \n" + "subs %[cnt], %[cnt], #1 \n" + + "vld1.16 {d4-d7}, [%[a]]! 
\n" + "vmlal.s16 q8, d8, d0[2] \n" + "vmlal.s16 q9, d9, d0[2] \n" + "vmlal.s16 q8, d10, d0[3] \n" + "vmlal.s16 q9, d11, d0[3] \n" + "vld1.16 {d8-d11}, [%[a]]! \n" + + "vmlal.s16 q8, d4, d1[0] \n" + "vmlal.s16 q9, d5, d1[0] \n" + "vmlal.s16 q8, d6, d1[1] \n" + "vmlal.s16 q9, d7, d1[1] \n" + "vld1.16 {d4-d7}, [%[a]]! \n" + "vmlal.s16 q8, d8, d1[2] \n" + "vmlal.s16 q9, d9, d1[2] \n" + "vmlal.s16 q8, d10, d1[3] \n" + "vmlal.s16 q9, d11, d1[3] \n" + "bne 1b \n" + "2: \n" + "vst1.32 {d16-d19}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4","q5", "q6", "q7", "q8", + "q9", "cc", "memory" + ); + // clang-format on + b += 8; + } +#endif + A_packed += lda; + } +} + void sgemm_prepack_c4(int M, int N, int K, diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h index 3229ff3e0774ce8bff02b12d79d7ec50ed873cea..51457d57405396f68bf1991bfa43cc6aa9fbe050 100644 --- a/lite/backends/arm/math/packed_sgemm_c4.h +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -54,6 +54,13 @@ void sgemm_prepack_c4_small(int M, const float* B, float* C, ARMContext* ctx); +void sgemm_prepack_c8_int16_small(int M, + int N, + int K, + const int16_t* A_packed, + const int16_t* B, + int32_t* C, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 3e6cbff0660be8f2542d059a39115bed52122ff1..8303851ece9dd2f1d053f9f4b888e42f2fdc0aad 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, } else { if (pad_bottom > 1) { coef_h = 1.f / 3; - } else if (pad_bottom = 1) { + } else if (pad_bottom == 1) { coef_h = 0.5f; } else { coef_h = 1.f; diff --git a/lite/backends/arm/math/prior_box.cc b/lite/backends/arm/math/prior_box.cc index 6daab69ebf00da24d67132afba4b9abef0afbd39..4ef7356e67cee4c47ddf3eb16ed5286b4271b41a 100644 --- a/lite/backends/arm/math/prior_box.cc +++ b/lite/backends/arm/math/prior_box.cc @@ -21,7 +21,7 @@ namespace lite { namespace arm { namespace math { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc index b8f9ab0a1a842a59971ad4c165d4c1be3426059a..ded76c1bdae354ca46a254309dcc6b3e216c92f4 100644 --- a/lite/backends/arm/math/sequence_pool.cc +++ b/lite/backends/arm/math/sequence_pool.cc @@ -46,11 +46,60 @@ void seq_pool_sum(const float* din, memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; height = height - 1; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; + int cnt_w = width >> 2; + int remain_w = width & 3; + int cnt_h = height >> 2; + int remain_h = height & 3; + int stride = width << 2; + for (int w = 0; w < cnt_w; w++) { + const float* din_ptr0 = din_ptr + w * 4; + float32x4_t dout_val = vld1q_f32(dout_ptr); + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + dout_val = vaddq_f32(din0, dout_val); + float32x4_t tmp = vaddq_f32(din1, din2); + 
din_ptr0 += stride;
+ din_ptr1 += stride;
+ dout_val = vaddq_f32(din3, dout_val);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ dout_val = vaddq_f32(tmp, dout_val);
}
- din_ptr += width;
+ for (int h = 0; h < remain_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ dout_val = vaddq_f32(din0, dout_val);
+ din_ptr0 += width;
+ }
+ vst1q_f32(dout_ptr, dout_val);
+ dout_ptr += 4;
+ }
+ const float* din_ptr00 = din_ptr + cnt_w * 4;
+ for (int w = 0; w < remain_w; w++) {
+ const float* din_ptr0 = din_ptr00 + w;
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ *dout_ptr += din_ptr0[0];
+ float tmp = din_ptr1[0] + din_ptr2[0];
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ *dout_ptr += din_ptr3[0];
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ *dout_ptr += tmp;
+ }
+ for (int h = 0; h < remain_h; h++) {
+ *dout_ptr += din_ptr0[0];
+ din_ptr0 += width;
+ }
+ dout_ptr++;
} } }
@@ -144,12 +193,62 @@ void seq_pool_max(const float* din, } else { memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width;
- int remain_h = height - 1;
- for (int h = 0; h < remain_h; h++) {
- for (int w = 0; w < width; w++) {
- dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]);
+ height = height - 1;
+ int cnt_w = width >> 2;
+ int remain_w = width & 3;
+ int cnt_h = height >> 2;
+ int remain_h = height & 3;
+ int stride = width << 2;
+ for (int w = 0; w < cnt_w; w++) {
+ const float* din_ptr0 = din_ptr + w * 4;
+ float32x4_t dout_val = vld1q_f32(dout_ptr);
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ float32x4_t din1 = vld1q_f32(din_ptr1);
+ float32x4_t din2 = vld1q_f32(din_ptr2);
+ float32x4_t din3 = vld1q_f32(din_ptr3);
+ dout_val = vmaxq_f32(din0, dout_val);
+ float32x4_t tmp = vmaxq_f32(din1, din2);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ dout_val = vmaxq_f32(din3, dout_val);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ dout_val = vmaxq_f32(tmp, dout_val);
}
- din_ptr += width;
+ for (int h = 0; h < remain_h; h++) {
+ float32x4_t din0 = vld1q_f32(din_ptr0);
+ dout_val = vmaxq_f32(din0, dout_val);
+ din_ptr0 += width;
+ }
+ vst1q_f32(dout_ptr, dout_val);
+ dout_ptr += 4;
+ }
+ const float* din_ptr00 = din_ptr + cnt_w * 4;
+ for (int w = 0; w < remain_w; w++) {
+ const float* din_ptr0 = din_ptr00 + w;
+ const float* din_ptr1 = din_ptr0 + width;
+ const float* din_ptr2 = din_ptr1 + width;
+ const float* din_ptr3 = din_ptr2 + width;
+ for (int h = 0; h < cnt_h; h++) {
+ *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
+ float tmp = std::max(din_ptr1[0], din_ptr2[0]);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ *dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ *dout_ptr = std::max(*dout_ptr, tmp);
+ }
+ for (int h = 0; h < remain_h; h++) {
+ *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
+ din_ptr0 += width;
+ }
+ dout_ptr++;
} } }
diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc
index 65d41b049123680f26674cc05d3c02172a260b31..b7f82e9f376e8b62195d884e8de19a142d76b316 100644
--- a/lite/backends/arm/math/softmax.cc
+++ b/lite/backends/arm/math/softmax.cc
@@ -531,7 +531,7 @@ void softmax_inner1_large_axis(const float* din, } float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float max_data =
std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { max_data = std::max(max_data, din_max_ptr[0]); din_max_ptr++; } @@ -557,7 +557,7 @@ void softmax_inner1_large_axis(const float* din, float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); sum_data += dout_sum_ptr[0]; din_sum_ptr++; diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 4c7cedaa97e22f74caebc5288fad8543f61bc88d..012004a65fa7d531ed85837e27b880c8c493ffca 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -41,6 +41,8 @@ << "CUDA: " << cudaGetErrorString(e); \ } +#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError()) + #define CUBLAS_CALL(func) \ { \ auto e = (func); \ @@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; +#endif +#if CUDNN_VERSION_MIN(8, 0, 0) + case CUDNN_STATUS_VERSION_MISMATCH: + return "CUDNN_STATUS_VERSION_MISMATCH"; #endif } return "Unknown cudnn status"; diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 9e33d38feedbe682f3c4d962b4ccb85b74af3a7b..c23d3d0ed0351b59d4a373efb2474e9a73763659 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -11,8 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps}) +nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps}) +nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps}) set ( math_cuda @@ -23,8 +28,13 @@ set ( cuda_transpose cuda_elementwise cudnn_pool + cuda_gru_forward + cuda_sequence2batch cuda_gemm cuda_batched_gemm + cuda_strided_gemm + cuda_sequence_padding + cuda_bias ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/activation.cu b/lite/backends/cuda/math/activation.cu index a45e3eb378eefdbabce0b837891514dc659e0429..4d97042aeb0b728b491fbc2dd12ddcc94b4c1490 100644 --- a/lite/backends/cuda/math/activation.cu +++ b/lite/backends/cuda/math/activation.cu @@ -13,6 +13,7 @@ // limitations under the License. 
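+// Usage sketch for the element-wise sigmoid helper added in this file
+// (assuming device pointers `x` and `y` holding `num` floats and an existing
+// CUDA `stream`):
+//   paddle::lite::cuda::math::sigmoid<float>(num, x, y, stream);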
#include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/math/activation.h" #include "lite/backends/cuda/math/utils.h" @@ -21,6 +22,20 @@ namespace lite { namespace cuda { namespace math { +ActivationType GetActiveType(const std::string& act) { + if (act == "sigmoid") { + return kSigmoid; + } else if (act == "relu") { + return kReLU; + } else if (act == "tanh") { + return kTanh; + } else if (act == "identify") { + return kIdentity; + } else { + LOG(FATAL) << "not supported activation: " << act; + } +} + template __global__ void relu_kernel(const int num, const float alpha, @@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t); template void bias_relu( int, const float*, const float* bias, float*, float, cudaStream_t); +// ------------- sigmoid ------------- + +template +__global__ void sigmoid_kernel(const int num, const T* in, T* out) { + CUDA_KERNEL_LOOP(i, num) { +#if __CUDA_ARCH__ >= 350 + out[i] = static_cast(1.0f) / + (static_cast(1.0f) + expf(-1 * __ldg(in + i))); +#else + out[i] = static_cast(1.0f) / (static_cast(1.0f) + expf(-in[i])); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half* in, half* out) { + CUDA_KERNEL_LOOP(i, num) { + half tmp = __float2half(1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __hdiv( + tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i))))); +#else + out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i])))); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) { + CUDA_KERNEL_LOOP(i, num) { + half2 tmp = __floats2half2_rn(1.0f, 1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __h2div(tmp, + __hadd2(tmp, + h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f), + __ldg(in + i))))); +#else + out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x)))); + out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y)))); +#endif + } +} + +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) { + sigmoid_kernel<<>>( + num, din, dout); + CUDA_POST_KERNEL_CHECK; +} + +template <> +void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) { + if (num % 2 == 0) { + const half2* din2 = reinterpret_cast(din); + half2* dout2 = reinterpret_cast(dout); + sigmoid_kernel< + half2><<>>( + num / 2, din2, dout2); + } else { + sigmoid_kernel<<>>( + num, din, dout); + } + CUDA_POST_KERNEL_CHECK; +} + +template void sigmoid(const int num, + const float* din, + float* dout, + cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git a/lite/backends/cuda/math/activation.h b/lite/backends/cuda/math/activation.h index 887a222ee83878aa19fd6a94a76572e48ab4d954..926ad8d99fc4bd6464ed517505fcf30f035c57f8 100644 --- a/lite/backends/cuda/math/activation.h +++ b/lite/backends/cuda/math/activation.h @@ -17,11 +17,22 @@ #include #include +#include "lite/utils/cp_logging.h" + namespace paddle { namespace lite { namespace cuda { namespace math { +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +ActivationType GetActiveType(const std::string& act); + // fp32 and half template void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); @@ -72,6 +83,9 @@ void bias_int8_nhwc(int num, const void* scale, cudaStream_t stream); +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git 
a/lite/backends/cuda/math/bias.cu b/lite/backends/cuda/math/bias.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e597e51c81cf75ddc2f850ac41924a0176ecb45 --- /dev/null +++ b/lite/backends/cuda/math/bias.cu @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/backends/cuda/math/bias.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void RowwiseAddKernel( + const T* a, const T* b, T* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template <> +__global__ void RowwiseAddKernel( + const half* a, const half* b, half* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = __hadd(a[i], b[w]); + } +} + +template +void RowwiseAdd::operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream) { + RowwiseAddKernel<<>>( + input, bias, output, width, count); + CUDA_POST_KERNEL_CHECK; +} + +template struct RowwiseAdd; +template struct RowwiseAdd; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/bias.h b/lite/backends/cuda/math/bias.h new file mode 100644 index 0000000000000000000000000000000000000000..98f805a013ff80b267301be4d47a9694c5ce642f --- /dev/null +++ b/lite/backends/cuda/math/bias.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
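+// Usage sketch for the RowwiseAdd functor declared below: it adds a
+// length-`width` bias vector to every row of a row-major matrix holding
+// `count` elements in total. Assuming device pointers `dout` and `bias` and an
+// existing CUDA `stream`, a call (here in place, dout = dout + bias) would be:
+//   paddle::lite::cuda::math::RowwiseAdd<float> add_bias;
+//   add_bias(dout, bias, dout, width, count, stream);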
+ +#pragma once +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +struct RowwiseAdd { + void operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 19ace2762af7d2088d5235e20387d8a4d941be30..5db41302c0cb0133e3badad0b5fa167d2c88f9df 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -161,15 +161,17 @@ bool CudnnConv2D::create(const operators::ConvParam& param, search_func); } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); + int requestedAlgoCount = 1; + int returnedAlgoCount; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_, + this->input_desc_, + this->filter_desc_, + this->conv_desc_, + this->output_desc_, + requestedAlgoCount, + &returnedAlgoCount, + &this->algo_perf_)); + this->fwd_algo_ = this->algo_perf_.algo; } CUDNN_CHECK( cudnnGetConvolutionForwardWorkspaceSize(this->handle_, diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h index f73f1db7b1785814b6e97f28c8624b76fa75f89c..a084edefa17a5882f7e6d67407e1f48a818e3407 100644 --- a/lite/backends/cuda/math/cudnn_conv.h +++ b/lite/backends/cuda/math/cudnn_conv.h @@ -81,6 +81,7 @@ class CudnnConv2DBase { cudaStream_t stream_; cudnnHandle_t handle_; cudnnConvolutionFwdAlgo_t fwd_algo_; + cudnnConvolutionFwdAlgoPerf_t algo_perf_; cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t bias_desc_; @@ -98,8 +99,6 @@ class CudnnConv2DBase { const bool use_tensor_core_ = true; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t preference_ = - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; // For int8 Tensor temp_tensor_; diff --git a/lite/backends/cuda/math/gru_forward.cu b/lite/backends/cuda/math/gru_forward.cu new file mode 100644 index 0000000000000000000000000000000000000000..cd04c3871db07a18acab99c960a90124941ade5d --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
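+// The GRU forward pass is split into two kernels. GruForwardResetOutput
+// applies the gate activation to the update/reset gates and writes
+// reset_output = reset_gate * h_prev; GruForwardFinalOutput applies the
+// candidate activation to the cell state and blends it with h_prev, i.e.
+// h = (1 - update_gate) * h_prev + update_gate * candidate, or, in
+// origin_mode, h = update_gate * h_prev + (1 - update_gate) * candidate.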
+ +#include + +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + T prev_out = 0; + T reset_out_val; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == lite::cuda::math::ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } + + reset_out_val = prev_out * reset_gate_value; + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +template <> +__global__ void GruForwardResetOutput( + half* gate_value, + half* reset_output_value, + half* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + half prev_out = 0; + half reset_out_val; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } +#if __CUDA_ARCH__ >= 530 + reset_out_val = __hmul(prev_out, reset_gate_value); +#else + reset_out_val = + __float2half(__half2float(prev_out) * __half2float(reset_gate_value)); +#endif + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +/* + * 
threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T output; + T prev_out = 0; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { + output = update_gate_value * prev_out + state_frame_value - + update_gate_value * state_frame_value; + } else { + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template <> +__global__ void GruForwardFinalOutput( + half* gate_value, + half* prev_output_value, + half* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + half output; + half prev_out = 0; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { +#if __CUDA_ARCH__ >= 530 + output = + __hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value), + __hmul(update_gate_value, state_frame_value)); +#else + output = __float2half( + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(state_frame_value) - + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } else { +#if __CUDA_ARCH__ >= 530 + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)), + __hmul(update_gate_value, state_frame_value)); +#else + output = 
__float2half( + __half2float(prev_out) - + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template __global__ void GruForwardFinalOutput( + float* gate_value, + float* prev_output_value, + float* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +template __global__ void GruForwardResetOutput( + float* gate_value, + float* reset_output_value, + float* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gru_forward.h b/lite/backends/cuda/math/gru_forward.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1648c437e860bec07fbec7bbbd69b659a58407 --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.h @@ -0,0 +1,242 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline __device__ Dtype Sigmoid(const Dtype a) { + const Dtype min = SIGMOID_THRESHOLD_MIN; + const Dtype max = SIGMOID_THRESHOLD_MAX; + Dtype tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + expf(-tmp)); +} + +template <> +inline __device__ half Sigmoid(const half a) { +#if __CUDA_ARCH__ >= 530 + const half tmp = __float2half(1.0f); + return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a)))); +#else + return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f)); +#endif +} + +template +inline __device__ Dtype ReLU(const Dtype a) { + return a > static_cast(0.f) ? a : static_cast(0.f); +} + +template <> +inline __device__ half ReLU(const half a) { + const half tmp = __float2half(0.f); +#if __CUDA_ARCH__ >= 530 + return __hgt(a, tmp) ? a : tmp; +#else + return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f); +#endif +} + +template +inline __device__ Dtype Tanh(const Dtype a) { + Dtype tmp = static_cast(-2.0) * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + return (static_cast(2.0) / (static_cast(1.0) + expf(tmp))) - + static_cast(1.0); +} + +template <> +inline __device__ half Tanh(const half a) { +#if __CUDA_ARCH__ >= 530 + half tmp = __float2half(1.0f); + half numerator = __hmul(__float2half(-2.0f), a); + return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp); +#else + float tmp = -2.0f * __half2float(a); + return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f); +#endif +} + +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +/* + * threads(tile_size, 1) + * grids(frame_blocks, 1) + */ +template +__global__ void FastCollectiveGruGate(T* gate_value, + T* prev_output_value, + T* gate_weight, + T* reset_output, + int frame_size, + ActivationType active_node) { + T xt_0 = 0.0f; + T a0 = 0.0f; + T c0 = 0.0f; + T b0[TiledSize]; + + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tiled_mask = ((1 << TiledSize) - 1); + // tiled matrix multiply using register shift, faster than sm. + if (prev_output_value) { + for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = prev_output_value[threadIdx.x + (k * TiledSize)]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col]; + } + } + + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; +#endif + } + } + } + + __syncthreads(); + + if (col < frame_size * 2) { + xt_0 = gate_value[col]; + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col] = c0; + if (frame_size <= col && col < frame_size * 2) { + T htp_0 = 0.0; + if (prev_output_value) { + htp_0 = prev_output_value[col - frame_size]; + } + reset_output[col - frame_size] = c0 * htp_0; + } else if (col < frame_size) { + gate_value[col] = c0; + } + } +} + +template +__global__ void FastCollectiveGruOut(T* gate_weight, + T* prev_out_value, + T* output_value, + T* gate_value, + T* reset_value, + int frame_size, + ActivationType active_node, + bool origin_mode) { + int col = blockIdx.x * blockDim.x + threadIdx.x; + T a0 = 0.0f; + T b0[TiledSize]; + T c0 = 0.0f; + + int tiled_mask = ((1 << TiledSize) - 1); + if (prev_out_value) { + for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = reset_value[threadIdx.x + k * TiledSize]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col]; + } + } + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; 
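// Summary of the GRU math implemented by GruForwardResetOutput and
// GruForwardFinalOutput above (the fp16 branches obscure the arithmetic):
//   u_t = act_gate(g_u),  r_t = act_gate(g_r),  h~_t = act_node(g_c)
//   reset output            : r_t * h_{t-1}   (feeds the candidate GEMM outside the kernel)
//   origin_mode == true     : h_t = u_t * h_{t-1} + (1 - u_t) * h~_t
//   origin_mode == false    : h_t = (1 - u_t) * h_{t-1} + u_t * h~_t
// The half specializations evaluate the same expressions with __hmul/__hadd/__hsub
// on sm_53 and newer, and fall back to float arithmetic via __half2float/__float2half
// on older architectures.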
+#endif + } + } + } + + __syncthreads(); + + if (col < frame_size) { + T xt_0 = gate_value[col + 2 * frame_size]; + T gta_0 = gate_value[col]; + T htp_0 = 0; + if (prev_out_value) { + htp_0 = prev_out_value[col]; + } + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col + 2 * frame_size] = c0; + if (origin_mode) { + output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0; + } else { + output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0; + } + } +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/scale.cu b/lite/backends/cuda/math/scale.cu index 806a3697a2eb19354a81056f0a7ab6272ed991a1..f9d5209c3e4af11231f4b62531f9eb11ede56557 100644 --- a/lite/backends/cuda/math/scale.cu +++ b/lite/backends/cuda/math/scale.cu @@ -22,10 +22,6 @@ namespace lite { namespace cuda { namespace math { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) - template __global__ void scale_kernel(int count, const T* in_data, @@ -48,7 +44,6 @@ __global__ void scale_kernel(int count, template __global__ void scale_kernel( int count, const T* in_data, T* out_data, const T scale, const T bias) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; } } @@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num, } template -void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) { +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } template @@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } -template void scale(int num, const float*, float*, float, cudaStream_t, float); +template void scale(int num, const float*, float*, float, float, cudaStream_t); template void scale(int num, const float*, float*, float, float); } // namespace math diff --git a/lite/backends/cuda/math/scale.h b/lite/backends/cuda/math/scale.h index 52ed1d38ae79ce11cac50a9abef0f57e6de1352c..b9961b12c3c251ffb7f80589fa8c9ccb12d96e30 100644 --- a/lite/backends/cuda/math/scale.h +++ b/lite/backends/cuda/math/scale.h @@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num, cudaStream_t stream); template -void scale( - int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0); +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream); template void scale(int num, const T* in, T* out, T scale, T bias = 0); diff --git a/lite/backends/cuda/math/sequence2batch.cu b/lite/backends/cuda/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..9a93362b3bb163b889049d07186634987ed63940 --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.cu @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, + T* dst, + const uint64_t* index, + int height, + int width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int row_id = blockDim.y * blockIdx.x + idy; + if (row_id < height) { + int src_idx = is_src_index ? index[row_id] : row_id; + int dst_idx = is_src_index ? row_id : index[row_id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } +} + +template +void CopyMatrixRowsFunctor::operator()( + const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream) { + auto src_dims = src.dims(); + auto dst_dims = dst->dims(); + CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; + int height = dst_dims[0]; + int width = dst_dims[1]; + const auto* src_data = src.data(); + auto* dst_data = dst->template mutable_data(TARGET(kCUDA)); + + index_tensor_.Resize({static_cast(index_lod.size())}); + auto* index_tensor_data = index_tensor_.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(index_tensor_data, + index_lod.data(), + sizeof(uint64_t) * index_lod.size(), + IoDirection::HtoD, + stream); + dim3 threads(128, 8); + dim3 grids((height + threads.y - 1) / threads.y); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_tensor_data, height, width, is_src_index); + CUDA_POST_KERNEL_CHECK; +} + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; + +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence2batch.h b/lite/backends/cuda/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..e5a12ed0b4d54a9af47cfc046906ae96767e63cf --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.h @@ -0,0 +1,167 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, copy the indexed rows of input src to the output + // dst. If is_src_index is false, copy the input src to the indexed of output + // dst. The indexes rows are based on the input index. + void operator()(const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream); + + private: + lite::Tensor index_tensor_; +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + struct SeqInfo { + SeqInfo(size_t start_val, size_t len_val, size_t seq_val) + : start(start_val), length(len_val), seq_idx(seq_val) {} + size_t start; + size_t length; + size_t seq_idx; + }; + + public: + void operator()(const lite::Tensor& lod_tensor, + lite::Tensor* batch_tensor, + bool is_reverse, + const cudaStream_t& stream) const { + auto lods = lod_tensor.lod(); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; + const auto& lod = lods[0]; + + std::vector seq_info; + for (int seq_id = 0; seq_id < static_cast(lod.size()) - 1; ++seq_id) { + size_t length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { + return a.length > b.length; + }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // max_seqlen = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = 0 + // batch_start_positions[1] = len(b0) + // batch_start_positions[2] = len(b0) + len(b1) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + + LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + size_t max_seqlen = seq_info[0].length; + batch_lods[0].resize(max_seqlen + 1); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. 
+ batch_lods[2].resize(seq_info.size()); + + auto* batch_starts = batch_lods[0].data(); + auto* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (size_t n = 0; n < max_seqlen; ++n) { + size_t batch_id = batch_starts[n]; + for (size_t i = 0; i < seq_info.size(); ++i) { + size_t seq_len = seq_info[i].length; + size_t start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? start + seq_len - 1 - n : start + n; + ++batch_id; + } else { + break; + } + } + batch_starts[n + 1] = batch_id; + } + auto* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + + batch_tensor->set_lod(batch_lods); + + lite::cuda::math::CopyMatrixRowsFunctor to_batch; + to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const lite::Tensor& batch_tensor, + lite::Tensor* lod_tensor, + const cudaStream_t& stream) { + auto in_lod = batch_tensor.lod(); + CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at " + "least 2-level sequence infomation."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; + lite::cuda::math::CopyMatrixRowsFunctor to_seq; + to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.cu b/lite/backends/cuda/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..e4f194b9c2289c51983d62b3835727efea91028d --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.cu @@ -0,0 +1,164 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +enum CopyType { kSeqToPad, kPadToSeq }; + +template +__global__ void SequencePadKernel(T* dst, + const T* src, + const T* pad_value, + bool is_constant_pad, + const size_t* seq_offsets, + const int seq_num, + const int pad_seq_len, + const int step_width) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width; + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? 
seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; + } + } +} + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + pad_data, + seq_data, + pad_value_data, + is_constant_pad, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + CUDA_POST_KERNEL_CHECK; +} + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + seq_data, + pad_data, + nullptr, + false, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + CUDA_POST_KERNEL_CHECK; +} + +template void SequencePadding(float* pad_data, + const float* seq_data, + const float* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequencePadding(half* pad_data, + const half* seq_data, + const half* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(float* seq_data, + const float* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(half* seq_data, + const half* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.h b/lite/backends/cuda/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..cfbac9b5bce2cad75174695ee85c28720a3eaf11 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.cc b/lite/backends/cuda/math/strided_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..91013d977702682a42050407f49356bf7445bcbd --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/strided_gemm.h" + +#include + +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +bool StridedGemm::init(const bool trans_a, + const bool trans_b, + Context* ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool StridedGemm::run(const float alpha, + const float beta, + const int m, + const int n, + const int k, + const float* a_data, + const float* b_data, + float* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_32F, + ldb_, + stride_b, + a_data, + CUDA_R_32F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_32F, + ldc_, + stride_c, + batch_size, + CUDA_R_32F, + algo_)); + return true; +} + +template <> +bool StridedGemm::run(const half alpha, + const half beta, + const int m, + const int n, + const int k, + const half* a_data, + const half* b_data, + half* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_16F, + ldb_, + stride_b, + a_data, + CUDA_R_16F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_16F, + ldc_, + stride_c, + batch_size, + CUDA_R_16F, + algo_)); + return true; +} + +template class StridedGemm; +template class StridedGemm; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.h b/lite/backends/cuda/math/strided_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..4a0fe7143a2569eda36d203d9c905f2a4a0c772c --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.h @@ -0,0 +1,72 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
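Why the StridedGemm::run implementations above pass b_data before a_data and swap the m and n arguments: cuBLAS operates on column-major matrices while Lite tensors are row-major. A row-major C (m x n) reinterpreted as column-major is C^T (n x m), and C^T = B^T * A^T, so calling the batched GEMM with the operands exchanged produces the desired row-major C with no explicit transposes. A minimal sketch of the same trick using the plain fp32 strided-batched call; the function name and fixed alpha/beta are illustrative only:

#include <cublas_v2.h>

void BatchedGemmRowMajorSketch(cublasHandle_t handle, int m, int n, int k,
                               const float* A, long long int stride_a,
                               const float* B, long long int stride_b,
                               float* C, int batch) {
  const float alpha = 1.f;
  const float beta = 0.f;
  // Column-major view: C^T (n x m) = B^T (n x k) * A^T (k x m).
  cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                            /*m=*/n, /*n=*/m, /*k=*/k,
                            &alpha,
                            B, /*ldb=*/n, stride_b,   // B^T: leading dimension n
                            A, /*lda=*/k, stride_a,   // A^T: leading dimension k
                            &beta,
                            C, /*ldc=*/n,
                            static_cast<long long int>(m) * n,  // stride between C batches
                            batch);
}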
+ +#pragma once + +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class StridedGemm { + public: + StridedGemm() : cu_handle_(nullptr) {} + ~StridedGemm() {} + + bool init(const bool trans_a, + const bool trans_b, + Context* ctx); + + bool run(const PtypeIn alpha, + const PtypeIn beta, + const int m, + const int n, + const int k, + const PtypeIn* a_data, + const PtypeIn* b_data, + PtypeOut* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu index c50840fe269657965db8c58b171fce6819009775..d919bd757fbbcfcc5e5f8a3a4c18fbd1ed9ac53f 100644 --- a/lite/backends/cuda/math/transpose.cu +++ b/lite/backends/cuda/math/transpose.cu @@ -174,24 +174,9 @@ void Transpose::transpose(T* dst, TransposeCUDAImpl(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); } -// template -// void Transpose::transpose(T* dst, -// const T* src, -// const std::vector& src_dims, -// const std::vector& axes, -// cudaStream_t* stream) { -// std::vector _src_dims(src_dims.size(), 0); -// std::transform( -// src_dims.begin(), -// src_dims.end(), -// _src_dims.begin(), -// [](int data) -> int64_t { return static_cast(data); }); -// TransposeCUDAImpl(_src_dims, axes, src, dst, &Y_dims_, &strides_, -// stream); -//} - template class Transpose; template class Transpose; +template class Transpose; } // namespace math } // namespace cuda diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 3eeee84c1c46a65782e38b998bcd8142e08cbec1..caa9b3077fe96bf73e50b33688b90b71e0cd5c23 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/core/target_wrapper.h" namespace paddle { @@ -31,6 +32,16 @@ class TargetWrapper { static size_t num_devices(); static size_t maximum_stream() { return 0; } + static int GetComputeCapability() { + int dev_id = GetCurDevice(); + int major, minor; + CUDA_CALL(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, dev_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, dev_id)); + return major * 10 + minor; + } + static size_t GetCurDevice() { int dev_id; cudaGetDevice(&dev_id); diff --git a/lite/backends/host/target_wrapper.cc b/lite/backends/host/target_wrapper.cc index 5f020662a9d74aab6c28f79221d670e5de5ae048..00ce9dd6b349decc2f603692c2a6a0801bd4d7c0 100644 --- a/lite/backends/host/target_wrapper.cc +++ b/lite/backends/host/target_wrapper.cc @@ -19,7 +19,7 @@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* TargetWrapper::Malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; @@ -30,7 +30,6 @@ void* TargetWrapper::Malloc(size_t size) { void* r = 
reinterpret_cast(reinterpret_cast(p + offset) & (~(MALLOC_ALIGN - 1))); static_cast(r)[-1] = p; - memset(r, 0, size); return r; } void TargetWrapper::Free(void* ptr) { diff --git a/lite/backends/huawei_ascend_npu/CMakeLists.txt b/lite/backends/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..65616b4d357d4d29ca9b356abead2e1f6eb725d1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs}) +lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu) diff --git a/lite/backends/huawei_ascend_npu/device.cc b/lite/backends/huawei_ascend_npu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..c8dc3d1de46fe12c3cb41257f864bcb1ff82bd9a --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/huawei_ascend_npu/device.h" +#include +#include +#include "ge/ge_api_types.h" +#include "ge/ge_ir_build.h" +#include "graph/graph.h" +#include "lite/utils/io.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +std::shared_ptr Device::LoadFromMem( + const std::vector& model_buffer, const int device_id) { + if (model_buffer.size() == 0) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!"; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromMem( + reinterpret_cast(model_buffer.data()), + model_buffer.size())) { + return model_client; + } + return nullptr; +} + +std::shared_ptr Device::LoadFromFile( + const std::string& model_path, const int device_id) { + if (!paddle::lite::IsFileExists(model_path)) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromFile(model_path.c_str())) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + return model_client; + } + return nullptr; +} + +std::mutex Device::device_mutex_; + +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + std::lock_guard lock(device_mutex_); + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + + // Build IR model + ge::ModelBufferData om_buffer; + std::map options; + 
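// Note: the per-build options map below only sets ge::ir_option::LOG_LEVEL;
// process-wide ATC options such as ge::ir_option::SOC_VERSION are configured
// once in Device::InitOnce() further down. aclgrphBuildModel then lowers the
// ge::Graph into the serialized offline model (om) held by om_buffer, which is
// copied out into model_buffer for the caller.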
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error")); + + ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer)); + + // Copy from om model buffer + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data.get()), + om_buffer.length); + + return true; +} + +void Device::InitOnce() { + if (runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!"; + return; + } + // ACL runtime init => can only be called once in one process + ACL_CALL(aclInit(NULL)); + + // ATC builder init => can only be called once in one process + std::map global_options; + global_options.insert( + std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310")); + ATC_CALL(ge::aclgrphBuildInitialize(global_options)); + + runtime_inited_ = true; +} + +void Device::DestroyOnce() { + if (!runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!"; + return; + } + // ATC builder finalize => can only be called once in one process + ge::aclgrphBuildFinalize(); + // ACL runtime finalize => can only be called once in one process + ACL_CALL(aclFinalize()); + + runtime_inited_ = false; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/device.h b/lite/backends/huawei_ascend_npu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..de7ca55670ad019b0f035f9e8ab42c29748654f1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.h @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() { InitOnce(); } + + ~Device() { DestroyOnce(); } + + std::shared_ptr LoadFromMem( + const std::vector& model_buffer, const int device_id); + std::shared_ptr LoadFromFile(const std::string& model_path, + const int device_id); + // Build the ACL IR graph to the ACL om model + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); // NOLINT + + private: + void InitOnce(); + void DestroyOnce(); + bool runtime_inited_{false}; + static std::mutex device_mutex_; +}; + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.cc b/lite/backends/huawei_ascend_npu/model_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..02a8014210b24f8ae143ee68341aec0281d5a570 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.cc @@ -0,0 +1,398 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +bool AclModelClient::LoadFromMem(const void* data, uint32_t size) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + + auto ret = aclmdlQuerySizeFromMem( + data, size, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromMemWithMem(data, + size, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + model_desc_ = aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success."; + load_flag_ = true; + return true; +} + +bool AclModelClient::LoadFromFile(const char* model_path) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + auto ret = + aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromFileWithMem(model_path, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!"; + return false; + } + model_desc_ = 
aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + load_flag_ = true; + return true; +} + +bool AclModelClient::GetModelIOTensorDim( + std::vector* input_tensor, + std::vector* output_tensor) { + if (!model_desc_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!"; + return false; + } + size_t input_num = aclmdlGetNumInputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num; + for (size_t i = 0; i < input_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ...."; + aclmdlIODims input_dim; + aclmdlGetInputDims(model_desc_, i, &input_dim); + aclDataType data_type = aclmdlGetInputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetInputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format); + input_tensor->push_back(tensor_desc); + } + + size_t output_num = aclmdlGetNumOutputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num; + for (size_t i = 0; i < output_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ...."; + aclmdlIODims output_dim; + aclmdlGetOutputDims(model_desc_, i, &output_dim); + aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format); + output_tensor->push_back(tensor_desc); + } + return true; +} + +bool AclModelClient::GetTensorFromDataset( + std::vector>* output_tensor) { + size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_); + size_t tensor_output_num = reinterpret_cast(output_tensor->size()); + if (device_output_num != tensor_output_num) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output number not equal, device number is " + << device_output_num << "tensor number is " << tensor_output_num; + return false; + } + for (size_t i = 0; i < device_output_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + uint32_t device_size = aclGetDataBufferSize(buffer_device); + + void* tensor_data = nullptr; + aclError ret = aclrtMallocHost(&tensor_data, device_size); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret; + return false; + } + ret = aclrtMemcpy(tensor_data, + device_size, + device_data, + device_size, + ACL_MEMCPY_DEVICE_TO_HOST); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret; + return false; + } + if (output_tensor->at(i)->SetData(reinterpret_cast(tensor_data), + device_size) != ge::GRAPH_SUCCESS) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed"; + return false; + } + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Get output tensor 
from output dataset succeed."; + return true; +} + +void AclModelClient::CreateInputDataset( + std::vector>* input_tensor) { + input_dataset_ = aclmdlCreateDataset(); + if (input_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!"; + return; + } + + for (size_t i = 0; i < input_tensor->size(); i++) { + auto item = input_tensor->at(i); + size_t buffer_size = item->GetSize(); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is " + << buffer_size; + return; + } + void* buffer_data = reinterpret_cast(item->GetData()); + ret = aclrtMemcpy(buffer_device, + buffer_size, + buffer_data, + buffer_size, + ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is " + << buffer_size; + aclrtFree(buffer_device); + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed."; +} +void AclModelClient::CreateOutputDataset( + std::vector>* output_tensor) { + output_dataset_ = aclmdlCreateDataset(); + if (output_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!"; + return; + } + size_t output_size = aclmdlGetNumOutputs(model_desc_); + CHECK_EQ(output_size, output_tensor->size()); + for (size_t i = 0; i < output_size; i++) { + size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. 
size is " + << buffer_size; + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) != + ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed."; +} + +bool AclModelClient::ModelExecute( + std::vector>* input_tensor, + std::vector>* output_tensor) { + // check model exists + if (model_desc_ == nullptr) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] no model description, model execution failed!"; + return false; + } + // create input/output dataset + CreateInputDataset(input_tensor); + CreateOutputDataset(output_tensor); + + // model execution + ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_)); + + // get output + if (!GetTensorFromDataset(output_tensor)) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:" + << model_id_; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:" + << model_id_; + + return true; +} + +void AclModelClient::DestroyDataset(aclmdlDataset** dataset) { + if (*dataset == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!"; + return; + } + + size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset); + for (size_t i = 0; i < dataset_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + if (device_data == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!"; + } else { + if (aclrtFree(device_data) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!"; + } + } + if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!"; + } + } + if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!"; + } + *dataset = nullptr; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success."; +} + +bool AclModelClient::UnloadModel() { + if (!load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is " + << load_flag_; + return true; + } + + DestroyDataset(&input_dataset_); + DestroyDataset(&output_dataset_); + + aclError ret = aclmdlUnload(model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "unload model failed, model id is " << model_id_; + return false; + } + if (model_desc_ != nullptr) { + (void)aclmdlDestroyDesc(model_desc_); + model_desc_ = nullptr; + } + + if (model_memory_ptr_ != nullptr) { + aclrtFree(model_memory_ptr_); + model_memory_ptr_ = nullptr; + model_memory_size_ = 0; + } + + if (model_weight_ptr_ != nullptr) { + aclrtFree(model_weight_ptr_); + model_weight_ptr_ = nullptr; + model_weight_size_ = 0; + } + load_flag_ = false; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_; + return true; +} + +uint32_t AclModelClient::num_devices() { + uint32_t count = 0; + ACL_CALL(aclrtGetDeviceCount(&count)); + return count; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.h 
b/lite/backends/huawei_ascend_npu/model_client.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf19b26261a4ff0301b493c7edf2de6ce3f7ec1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.h @@ -0,0 +1,179 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/huawei_ascend_npu/utils.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class TensorDesc { + public: + TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) { + if (format == ACL_FORMAT_NHWC) { + dim_order[1] = 3; + dim_order[2] = 1; + dim_order[3] = 2; + } + // create ge::Tensordesc + ge_tensor_desc_ = new ge::TensorDesc( + GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type)); + CHECK(ge_tensor_desc_ != nullptr); + } + ~TensorDesc() { ge_tensor_desc_ = nullptr; } + int64_t GetNumber() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[0]); + } + int64_t GetChannel() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[1]); + } + int64_t GetHeight() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[2]); + } + int64_t GetWidth() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[3]); + } + const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; } + + private: + ge::Shape GetGeShape(aclmdlIODims dims) { + ge::Shape ge_shape({0, 0, 0, 0}); + for (size_t i = 0; i < dims.dimCount; i++) { + if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <" + << dims.dims[i] << ">"; + } + } + return ge_shape; + } + ge::Format GetGeFormat(aclFormat format) { + ge::Format ge_format = ge::FORMAT_NCHW; + switch (format) { + case ACL_FORMAT_NCHW: + ge_format = ge::FORMAT_NCHW; + break; + case ACL_FORMAT_NHWC: + ge_format = ge::FORMAT_NHWC; + break; + case ACL_FORMAT_ND: + ge_format = ge::FORMAT_ND; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format; + break; + } + return ge_format; + } + ge::DataType GetGeDataType(aclDataType data_type) { + ge::DataType ge_datatype = ge::DT_FLOAT; + switch (data_type) { + case ACL_FLOAT: + ge_datatype = ge::DT_FLOAT; + break; + case ACL_FLOAT16: + ge_datatype = ge::DT_FLOAT16; + break; + case ACL_INT8: + ge_datatype = ge::DT_INT8; + break; + case ACL_INT16: + ge_datatype = ge::DT_INT16; + break; + case ACL_INT32: + ge_datatype = ge::DT_INT32; + break; + case ACL_INT64: + ge_datatype = ge::DT_INT64; + break; + case ACL_BOOL: + ge_datatype = ge::DT_BOOL; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!"; + break; + } + return ge_datatype; + } + + private: + ge::TensorDesc* ge_tensor_desc_{nullptr}; + // n c h w order, default to ACL_FORMAT_NCHW + std::vector dim_order{0, 1, 2, 3}; +}; + +class AclModelClient { + 
public: + explicit AclModelClient(int device_id) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: " + << device_id; + device_num_ = num_devices(); + if (device_id < 0 || device_id >= device_num_) { + LOG(FATAL) << "Failed with invalid device id " << device_id; + return; + } + device_id_ = device_id; + ACL_CALL(aclrtSetDevice(device_id_)); + } + + ~AclModelClient() { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: " + << device_id_; + ACL_CALL(aclrtResetDevice(device_id_)); + } + + bool LoadFromMem(const void* data, uint32_t size); + bool LoadFromFile(const char* model_path); + bool GetModelIOTensorDim(std::vector* input_tensor, + std::vector* output_tensor); + bool ModelExecute(std::vector>* input_tensor, + std::vector>* output_tensor); + bool UnloadModel(); + + private: + void CreateInputDataset( + std::vector>* input_tensor); + void CreateOutputDataset( + std::vector>* output_tensor); + bool GetTensorFromDataset( + std::vector>* output_tensor); + void DestroyDataset(aclmdlDataset** dataset); + + private: + uint32_t num_devices(); + + private: + int device_id_{0}; + int device_num_{0}; + aclrtContext context_{nullptr}; + bool load_flag_{false}; + uint32_t model_id_{0}; + size_t model_memory_size_; + size_t model_weight_size_; + void* model_memory_ptr_; + void* model_weight_ptr_; + aclmdlDesc* model_desc_{nullptr}; + aclmdlDataset* input_dataset_{nullptr}; + aclmdlDataset* output_dataset_{nullptr}; +}; + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/utils.h b/lite/backends/huawei_ascend_npu/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e2bff3f87e0831f7b98be60ef3980f10da610f10 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/utils.h @@ -0,0 +1,128 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "acl/acl.h" +#include "ge/ge_api_types.h" +#include "ge/ge_ir_build.h" +#include "graph/ge_error_codes.h" +#include "graph/graph.h" +#include "graph/tensor.h" +#include "graph/types.h" +#include "lite/utils/cp_logging.h" + +/* + * This file contains some Huawei Ascend NPU specific uitls. 
+ */ + +#define ACL_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ACL_ERROR_NONE) \ + << (msg) << " Huawei Ascend NPU ACL Error: " \ + << ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \ + reinterpret_cast(msg)) + +#define ATC_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ge::GRAPH_SUCCESS) \ + << (msg) << " Huawei Ascend NPU ATC Error: " \ + << ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \ + reinterpret_cast(msg)) + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +static const char* AtcErrorInfo(uint32_t error) { + switch (error) { +#define LITE_ATC_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF + LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649 +#undef LITE_ATC_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +static const char* AclErrorInfo(int error) { + switch (error) { +#define LITE_ACL_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003 + LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011 + LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013 + LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026 + LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033 + LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034 + LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035 + LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037 + LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038 + LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000 + LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001 + 
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002 + LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003 + LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005 + LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006 + LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000 + LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000 + LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001 + LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002 + LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003 + LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004 + LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005 +#undef LITE_ACL_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc index 2385f69246a163830e0df855082d728da2743e02..b98854946db7eda4f133d773ae0f5ba9e45a77cc 100644 --- a/lite/backends/mlu/target_wrapper.cc +++ b/lite/backends/mlu/target_wrapper.cc @@ -15,6 +15,7 @@ #include "lite/backends/mlu/target_wrapper.h" #include +#include #include "lite/backends/mlu/mlu_utils.h" @@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { } // namespace mlu +thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270}; +thread_local int TargetWrapperMlu::mlu_core_number_{1}; +thread_local bool TargetWrapperMlu::use_first_conv_{false}; +thread_local std::vector TargetWrapperMlu::mean_vec_; +thread_local std::vector TargetWrapperMlu::std_vec_; +thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)}; + size_t TargetWrapperMlu::num_devices() { uint32_t dev_count = 0; CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; @@ -77,15 +85,42 @@ void TargetWrapperMlu::MemcpySync(void* dst, LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); } } +void TargetWrapperMlu::SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + mean_vec_ = firstconv_param.first; + std_vec_ = firstconv_param.second; + use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty()); + input_layout_ = input_layout; +} + +cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() { + return mlu_core_version_; +} + +int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; } + +bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; } + +const std::vector& TargetWrapperMlu::MeanVec() { return mean_vec_; } + +const std::vector& TargetWrapperMlu::StdVec() { return std_vec_; } -// void TargetWrapperMlu::MemcpyAsync(void* dst, -// const void* src, -// size_t size, -// IoDirection dir, -// const stream_t& stream) { -// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; -// MemcpySync(dst, src, size, dir); -// } +DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; } } // namespace lite } // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h index 
2d9e10806f78e56f50b04d408dab219c923456fc..2566ae153e2f9539d1ad5739f208bc5f946a7542 100644 --- a/lite/backends/mlu/target_wrapper.h +++ b/lite/backends/mlu/target_wrapper.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include +#include #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/target_wrapper.h" @@ -43,11 +45,25 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); - // static void MemcpyAsync(void* dst, - // const void* src, - // size_t size, - // IoDirection dir, - // const queue_t& queue); + static void SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param); + static cnmlCoreVersion_t MLUCoreVersion(); + static int MLUCoreNumber(); + static bool UseFirstConv(); + static const std::vector& MeanVec(); + static const std::vector& StdVec(); + static DataLayoutType InputLayout(); + + private: + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; }; } // namespace lite diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index f9803aa8810ada33b9eecafe1502515501514e41..2b2d5321ba6dbac7ff002039c3c8a0423cbe0a6e 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -20,96 +20,122 @@ namespace paddle { namespace lite { namespace npu { -bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "wb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - uint32_t write_size = - (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); - CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !"; - - fclose(fp); - return true; -} - -bool ReadFromOMFile(domi::ModelBufferData* om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "rb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - fseek(fp, 0, SEEK_END); - uint32_t model_length = (uint32_t)ftell(fp); - fseek(fp, 0, SEEK_SET); - om_model_buff->data = malloc(model_length); - om_model_buff->length = model_length; - uint32_t read_size = - (uint32_t)fread(om_model_buff->data, 1, model_length, fp); - CHECK_EQ(read_size, model_length) << "read om file failed !"; - - fclose(fp); - return true; -} - -std::shared_ptr Device::Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_full_dir = "" // NOLINT - ) { - VLOG(3) << "[NPU] Build model"; - // Build the HiAI IR graph to the HiAI om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - - if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) { - VLOG(3) << "Will read om model from " << model_cache_full_dir; - ReadFromOMFile(&om_model_buf, model_cache_full_dir); - } else { - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return nullptr; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - 
ir_build.ReleaseModelBuff(om_model_buf); - return nullptr; - } - if (!model_cache_full_dir.empty()) { - VLOG(3) << "Will write om model to " << model_cache_full_dir; - WriteToOMFile(om_model_buf, model_cache_full_dir); - } - } - +std::shared_ptr Device::Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp) { // Create a HiAI model manager client to load the HiAI om model - std::shared_ptr model_client( - new hiai::AiModelMngerClient()); + auto model_client = std::make_shared(); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; - ir_build.ReleaseModelBuff(om_model_buf); + LOG(WARNING) << "[NPU] Init hiai model client failed!"; return nullptr; } + // Check HiAI DDK version + const char* ddk_version = model_client->GetVersion(); + if (ddk_version) { + VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version; + } else { + LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!"; + } + // Check model compatibility auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); - model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); - std::vector> model_descs; - model_descs.push_back(model_desc); + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + if (!*model_comp && + model_client->CheckModelCompatibility(*model_desc, *model_comp) != + hiai::AI_SUCCESS) { + *model_comp = false; + VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to " + << *model_comp; + } else { + *model_comp = true; + VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to " + << *model_comp; + } + // Rebuild and write the data of the compatible model to the model buffer + if (!*model_comp) { + std::shared_ptr model_builder = + std::make_shared(model_client); + hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate( + reinterpret_cast(model_buffer->data()), model_buffer->size()); + if (org_model_buffer) { + std::vector org_model_buffers; + org_model_buffers.push_back(org_model_buffer); + hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate( + framework_type(), org_model_buffers); + // VLOG(3) << "[NPU] new model buffer memeory size is " << + // new_model_buffer->GetMemBufferSize(); + if (new_model_buffer) { + uint32_t new_model_size = 0; + if (model_builder->BuildModel(org_model_buffers, + new_model_buffer, + new_model_size) == hiai::AI_SUCCESS) { + // need to change to new_model_size as GetMemBufferSize is not + // correct. 
+ model_buffer->resize(new_model_size); + memcpy(reinterpret_cast(model_buffer->data()), + new_model_buffer->GetMemBufferData(), + new_model_size); + // Reset the model buffer + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + VLOG(3) << "[NPU] Rebuild the compatible model done."; + } else { + LOG(WARNING) << "[NPU] Rebuild the compatible model failed!"; + } + model_builder->MemBufferDestroy(new_model_buffer); + } else { + LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!"; + } + model_builder->MemBufferDestroy(org_model_buffer); + } else { + LOG(WARNING) << "[NPU] InputMemBufferCreate failed!"; + } + } + // Load the compatible model + std::vector> model_descs{ + model_desc}; if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - ir_build.ReleaseModelBuff(om_model_buf); - VLOG(3) << "[NPU] Build done"; + VLOG(3) << "[NPU] Load model done."; return model_client; } +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + + // Build the HiAI om model, serialize and output it to the om buffer + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_buffer; + if (!ir_build.CreateModelBuff(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return false; + } + if (!ir_build.BuildIRModel(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_buffer); + return false; + } + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data), + om_buffer.length); + ir_build.ReleaseModelBuff(om_buffer); + VLOG(3) << "[NPU] Build model done."; + return true; +} + } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index cf03e097194bf20ab428677b09b840991e8a902c..5862f0b393292d95b6500ae75171fab07a5279a6 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -38,14 +38,18 @@ class Device { int model_type() { return model_type_; } int device_type() { return device_type_; } + // Load the HiAI om model from buffer, rebuild the model if it's incompatible + // with the current device, then create a HiAI model manager client(from HiAI + // Server) to run inference + std::shared_ptr Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp); // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
- std::shared_ptr Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_name // NOLINT - ); // NOLINT + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); private: int freq_level_{3}; diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 67d679fdd596b109b714bf7ba3cd45b2632b9420..002073517bc61af60da213db9af6e56da5f5b501 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } -cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; @@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } -cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 82d15bee5ec460a1fb06430571f007fcef23f66f..c204a8510402b8741c761938c3b2c37ac07fe961 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -62,10 +62,10 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); - cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); bool IsArmMali(); diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0 = 0; + CL_COMPUTE_DTYPE b0 = 0; + CL_COMPUTE_DTYPE c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 1c808da68ddc923e12234bc4b6ac99b35bfffb0b..9209f0e0f8d04fad5e788f3742c7922af8e13f49 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 771765ea6063a08784ae824a757b28450d808f6d..6a3aa6455daf8d20430a434ff6f47dac382f1f74 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 79f3922e89549fc15b7a849efb0e2b6595357102..739f852a7c6b60e4c38cb2523dfb745af65bc8df 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index d856af6a1d4026b1595bc287901e53f64267dc81..f08d53fa4968d041337adfe3252529bca3b5c55e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t 
new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 4ed2e072022dc4b457a86d634bf4bc21ab62bc45..4cce039f27b750950a1475ac266e0f5117c6d259 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 4998dc99279fffad8750ef3b6495597e9fc4ad65..2a2f210601e760651ee850686391af3c040fbe7f 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d82f4b4c96b586b6ecf948827402afd0766dcea4..4eadcd9f8032996abae04660b6878ab5beaff9a7 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 27313aea23ed16ecc7a6763dfbbbe63bca18941a..465b9f8f925a130b4d1b059ab15e93bc29128ec7 100755 --- 
a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 5626fe6be7d451d4ffe22a2008affa7d82298bc3..6fbdc21f934f21dd26c3eb66885f7087e3d340c0 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8533076b79aa2e94e30e38dd34d3f2292fdf88a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void transpose_4d(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4 + 2; + const int out_c3 = out_c * 4 + 3; + + const int in_n = out_n; + const int in_c = out_w * 0.25; + const int in_h0 = out_c0; + const int in_h1 = out_c1; + const int in_h2 = out_c2; + const int in_h3 = out_c3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_h3; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0); + + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1); + if(out_w % 4 == 0) { + output.y = input1.x; + } else if(out_w % 4 == 1) { + output.y = input1.y; + } else if(out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2); + if (out_w % 4 == 0){ + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input; + CL_DTYPE4 output; + 
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos); + + output = input; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index d8232cda4c790646fb5a4aae7d4e00d272d3a640..fe6b8fcd99d3f615aefd25145e97b7a08a537794 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() { } bool CLRuntime::Init() { - if (initialized_) { + if (is_cl_runtime_initialized_) { return true; } bool is_platform_init = InitializePlatform(); bool is_device_init = InitializeDevice(); - is_init_success_ = is_platform_init && is_device_init; - initialized_ = true; - - context_ = CreateContext(); - command_queue_ = CreateCommandQueue(context()); - return initialized_; + LOG(INFO) << "is_platform_init:" << is_platform_init; + LOG(INFO) << "is_device_init:" << is_device_init; + if ((is_platform_init == true) && (is_device_init == true)) { + is_platform_device_init_success_ = true; + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); + is_cl_runtime_initialized_ = true; + } + return is_cl_runtime_initialized_; } cl::Platform& CLRuntime::platform() { @@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() { } cl::Device& CLRuntime::device() { - CHECK(device_ != nullptr) << "device_ is not initialized!"; + if (device_ == nullptr) { + LOG(ERROR) << "device_ is not initialized!"; + } return *device_; } @@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) { } bool CLRuntime::InitializeDevice() { + VLOG(3) << "device_info_.size():" << device_info_.size(); + for (auto i : device_info_) { + VLOG(3) << ">>> " << i.first << " " << i.second; + } + if (device_info_.size() > 0 && device_info_.size() <= 2) { + return false; + } + device_info_["PLACEHOLDER"] = 1; // ===================== BASIC ===================== // CL_DEVICE_TYPE_GPU // CL_DEVICE_NAME @@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() { status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); if (all_devices.empty()) { - LOG(FATAL) << "No OpenCL GPU device found!"; + LOG(ERROR) << "No available OpenCL GPU device found!"; return false; } device_ = std::make_shared(); @@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() { } std::map& CLRuntime::GetDeviceInfo() { - if (0 != device_info_.size()) { - return device_info_; - } InitializeDevice(); return device_info_; } diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 3eeea7d63ae8f81e7eb395bc0da70caaf94c2a79..7e28130e15da0d45e62d984202f76aa1aff9762c 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_utility.h" +#include "lite/backends/opencl/cl_wrapper.h" typedef enum { UNKNOWN = 0, @@ -68,6 +69,28 @@ class CLRuntime { public: static CLRuntime* Global(); + bool OpenCLAvaliableForDevice() { + bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound(); + LOG(INFO) << "opencl_lib_found:" << opencl_lib_found; + if (opencl_lib_found == false) return false; + + bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess(); + LOG(INFO) << "dlsym_success:" << dlsym_success; + if (opencl_lib_found == false) return false; + + InitializeDevice(); + bool support_fp16 = + static_cast(device_info_["CL_DEVICE_EXTENSIONS_FP16"]); + LOG(INFO) << "support_fp16:" << support_fp16; + if (support_fp16 == false) return false; + + is_device_avaliable_for_opencl_ = + dlsym_success && opencl_lib_found && support_fp16; + LOG(INFO) << "is_device_avaliable_for_opencl_:" + << is_device_avaliable_for_opencl_; + return is_device_avaliable_for_opencl_; + } + bool Init(); cl::Platform& platform(); @@ -85,7 +108,7 @@ class CLRuntime { bool BuildProgram(cl::Program* program, const std::string& options = ""); - bool IsInitSuccess() { return is_init_success_; } + bool IsInitSuccess() { return is_platform_device_init_success_; } std::string cl_path() { return cl_path_; } @@ -167,9 +190,11 @@ class CLRuntime { cl_int status_{CL_SUCCESS}; - bool initialized_{false}; + bool is_device_avaliable_for_opencl_{false}; + + bool is_cl_runtime_initialized_{false}; - bool is_init_success_{false}; + bool is_platform_device_init_success_{false}; }; } // namespace lite diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 41011b593120d896cd1e6a2537ca59c4cf2a0835..5580a487eaaaf77676d2d6bd41542596504774a4 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -19,14 +19,16 @@ limitations under the License. */ namespace paddle { namespace lite { + CLWrapper *CLWrapper::Global() { static CLWrapper wrapper; return &wrapper; } CLWrapper::CLWrapper() { - CHECK(InitHandle()) << "Fail to initialize the OpenCL library!"; - InitFunctions(); + opencl_lib_found_ = InitHandle(); + CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!"; + dlsym_success_ = InitFunctions(); } bool CLWrapper::InitHandle() { @@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() { } } -void CLWrapper::InitFunctions() { +bool CLWrapper::InitFunctions() { CHECK(handle_ != nullptr) << "The library handle can't be null!"; + bool dlsym_success = true; #define PADDLE_DLSYM(cl_func) \ do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(FATAL) << "Cannot find the " << #cl_func \ + LOG(ERROR) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ + dlsym_success = false; \ break; \ } \ VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \ @@ -106,7 +110,7 @@ void CLWrapper::InitFunctions() { PADDLE_DLSYM(clCreateCommandQueue); // note(ysh329): consider compatibility for cl_driver_version 1.10 // using clCreateCommandQueue instead. 
- // PADDLE_DLSYM(clCreateCommandQueueWithProperties); + // PADDLE_DLSYM(clCreateCommandQueueWithProperties); PADDLE_DLSYM(clReleaseCommandQueue); PADDLE_DLSYM(clCreateProgramWithBinary); PADDLE_DLSYM(clRetainContext); @@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() { PADDLE_DLSYM(clEnqueueCopyImage); #undef PADDLE_DLSYM + return dlsym_success; } } // namespace lite @@ -445,9 +450,8 @@ CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties( // ->clCreateCommandQueueWithProperties()( // context, device, properties, errcode_ret); // - cl_command_queue_properties cl_cmd_properties; return paddle::lite::CLWrapper::Global()->clCreateCommandQueue()( - context, device, cl_cmd_properties, errcode_ret); + context, device, 0, errcode_ret); } CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue( diff --git a/lite/backends/opencl/cl_wrapper.h b/lite/backends/opencl/cl_wrapper.h index 35ef33e5a2f3973217e0e4c36caf1f8eb0fbdcb2..4df86b4028f92883718e7da0967f4a88ab20cc6d 100644 --- a/lite/backends/opencl/cl_wrapper.h +++ b/lite/backends/opencl/cl_wrapper.h @@ -508,13 +508,20 @@ class CLWrapper final { return clEnqueueCopyImage_; } + bool OpenclLibFound() { return opencl_lib_found_; } + + bool DlsymSuccess() { return dlsym_success_; } + private: CLWrapper(); CLWrapper(const CLWrapper &) = delete; CLWrapper &operator=(const CLWrapper &) = delete; bool InitHandle(); - void InitFunctions(); + bool InitFunctions(); + bool opencl_lib_found_{true}; + bool dlsym_success_{true}; void *handle_{nullptr}; + clGetPlatformIDsType clGetPlatformIDs_{nullptr}; clGetPlatformInfoType clGetPlatformInfo_{nullptr}; clBuildProgramType clBuildProgram_{nullptr}; diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 4978dfb84a4ee5770df011c54dccde59a62135b7..0d4301c5b6a56e50eba2d9a6ae13ce353a9b1e2e 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" // DEFINE_string(cudnn_dir, // "", @@ -178,7 +178,7 @@ auto error_msg = #endif // !_WIN32 if (throw_on_error) { CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); + // CHECK(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc index c49984691e5beca5a42defd68243e1352372cf11..6318916dfa53d5cce0c33d0149a520ccb9288c28 100644 --- a/lite/backends/x86/jit/benchmark.cc +++ b/lite/backends/x86/jit/benchmark.cc @@ -319,8 +319,8 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 7e697014ed241a75693b783127633b255964f80b..e6628058d03959a2a58b403a6ad61af6c50b431c 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + CHECK_GT(attr.table_height, 0); + CHECK_GT(attr.table_width, 0); + CHECK_GT(attr.index_height, 0); + CHECK_GT(attr.index_width, 0); + CHECK_GT(attr.out_width, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7bb248dd1d384af949fd3cd190df3d90d21921ef..d013887be5ecec1f67fa022b49b889f9cee9ade4 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index f78df73f66532f891721c74cff9c78cc3bb61922..87fe758809e3e7e18d2f939a26f3729b937bf6f6 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -27,7 +27,7 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); + CHECK_GT(groups.front(), 0); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 
32 : 16) - 1; @@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + CHECK_GT(attr.m, 0); + CHECK_GT(attr.n, 0); + CHECK_GT(attr.k, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index 95edc14201ac94d302ff806d0a4b8f5f50b2835c..8bc1e41d0a17d548c47819b5e11daf7ed5065e86 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -19,7 +19,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + CHECK_EQ(m_, 1) << "Only support m==1 yet"; this->genCode(); } diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index 4c80737aac4bc9cd09f4ff222c8fad8c441887ec..c54093e4dfa00f89f51c70840c45518f3eddfd3d 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + CHECK_GT(attr.w, 0); + CHECK_GT(attr.h, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index a00428f3e0982889665cd23b21a5978c7c239399..a1bde4a9b66f22ef8815bdc61fe866065e7f4203 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); + CHECK_EQ(reg_idx, rest_used_num_regs) + << "All heights should use same regs"; for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 44e083366132c675b339b2da4bbb3b7c1c6b7569..f91f1305ee30af708443e6a9a8bbb3fae2cc0b80 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -17,7 +17,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + CHECK_EQ(attr.param_width, attr.grad_width); + CHECK_LE(attr.selected_rows_size, attr.grad_height); + CHECK_GE(attr.selected_rows_size, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git 
a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index fb1e71f7b0b1e6f68a331d264682e80fbab7c219..7c4860ba5084860b67b6ecb7e3eed8aafb16cb2c 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -16,7 +16,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + CHECK_GT(w, 0); return make_unique(w, CodeSize(w)); } }; diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index a3376be423828b25c6eda6fff30a56578c7bbbe5..a9a89fdb205ad54268986eeee628aec75ac01b74 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,8 +21,8 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free @@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) { #ifdef _WIN32 ptr = _aligned_malloc(size, alignment); #else - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); + CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size + << " error!"; #endif - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size; return ptr; } diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc index 8322f7ebd2ce78f99979574983d81cebe5139606..f80a24d15c4666eacd31770c46f8a7ad4e7cfb37 100644 --- a/lite/backends/x86/jit/helper.cc +++ b/lite/backends/x86/jit/helper.cc @@ -14,9 +14,10 @@ #include "lite/backends/x86/jit/helper.h" #include // tolower +#include #include #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,12 +105,12 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + CHECK_GT(i, 0) << "each element of groups should be larger than 0."; }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); + CHECK_GE(sum * block, n) + << "The packed n should be equal to or larger than n"; const int block_len = sizeof(float) * block; int n_offset = 0; diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h index f741edbbed5b721fb9104a9c9a171a12532e4705..57a3611bb671c6d83ec3212702a57e3fc7d7f35f 100644 --- a/lite/backends/x86/jit/helper.h +++ b/lite/backends/x86/jit/helper.h @@ -23,7 +23,7 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernel_key.h" #include "lite/backends/x86/jit/kernel_pool.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = 
ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + CHECK(ref_iter != ref_pool.end()) + << "Every Kernel should have reference function."; auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -94,7 +94,7 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + CHECK(p) << "The Refer kernel should exsit"; return p->GetFunc(); } @@ -125,7 +125,7 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + CHECK(ref != nullptr) << "Refer Kernel can not be empty."; res.emplace_back(ref); return res; } @@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + CHECK(i) << "jitcode kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + CHECK(i) << "kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -166,7 +166,7 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + CHECK_GE(funcs.size(), 1UL); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. 
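
Since the comment above only hints at how the "default best" kernel is picked, here is a small hedged sketch of how these helpers are typically consumed. VMulTuple and the raw float buffers are assumptions for illustration; the point is that GetDefaultBestFunc returns the front of GetAllCandidateFuncs, i.e. a generated JitCode kernel when one exists, then "more" implementations such as MKL, with the refer kernel as the final fallback.

// Hedged sketch, not part of the patch: resolve the best available kernel
// for a given attribute once, then call it like a plain function pointer.
#include "lite/backends/x86/jit/helper.h"

void ElementwiseMul(const float* x, const float* y, float* z, int n) {
  namespace jit = paddle::lite::jit;
  // attr_type for the XYZN-style tuples is just the vector length `n`.
  auto vmul = jit::GetDefaultBestFunc<jit::VMulTuple<float>,
                                      paddle::lite::fluid::CPUPlace>(n);
  vmul(x, y, z, n);  // same signature as the refer implementation
}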
diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc index a6288fcf19d6867e1e1eb0bce32e559a4f303929..30397ffe1c4980e4af19a7a0eb44b47585b44f2c 100644 --- a/lite/backends/x86/jit/kernel_key.cc +++ b/lite/backends/x86/jit/kernel_key.cc @@ -14,7 +14,7 @@ #include "lite/backends/x86/jit/kernel_key.h" #include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 6bc791e64575b8f481f91ea3c28ea4896fe1860d..473e1253194513c16d6d8c3b52eac110512e806e 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -18,7 +18,7 @@ #include #include #include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,11 +104,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -175,22 +175,22 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h index d8c8d86911ab9a7794192aa68fb0c0571b1e4d26..b7243dfda350e8d0ea5909cf84ae3aa76d845055 100644 --- a/lite/backends/x86/jit/refer/refer.h +++ b/lite/backends/x86/jit/refer/refer.h @@ -22,7 +22,6 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/macro.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" namespace paddle { namespace lite { @@ -480,12 +479,12 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: 
%d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -527,12 +526,12 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc index aafcad579fdefd675323e0e2a6f40bd89c2a0166..03570a56d9c766271be630fe1d2e3048c6c42608 100644 --- a/lite/backends/x86/jit/test.cc +++ b/lite/backends/x86/jit/test.cc @@ -910,8 +910,8 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 5d7e98629cb89bd7a3fdee852507e0f381e54931..274e8836dd6e59d610ddeb7a63f898cdc1b19cc1 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -116,7 +116,7 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); // if (!lite::fluid::CheckLoD(lod)) { - // //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + // //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod)); //} selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 3bc5f9f67ad96e7ec699400ff6369fe48c745b7e..4c6bf06951f81e90a73c91c2378f904db5678495 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); + CHECK_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 34b258892be05625ae88076eff175f56a53d3537..4a64e45ea945f2d46c06ba31d67bd2a0fbf7c635 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -287,22 +287,22 @@ struct CBlas { template <> struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; } static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + LOG(FATAL) << "float16 SMM_GEMM not supported on CPU"; } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) 
{ PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; } + static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; } static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + LOG(FATAL) << "float16 VSQUARE not supported on CPU"; } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; } + static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; }; + static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; }; + static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU"; } #endif }; @@ -461,11 +461,11 @@ void Blas::MatMul(const lite::Tensor &mat_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - // PADDLE_ENFORCE( - // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - // "The targets of matrices must be same"); + CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2) + << "The input and output of matmul be matrix"; + // CHECK( + // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target()) + // << "The targets of matrices must be same"; int M = dim_out[0]; int N = dim_out[1]; @@ -746,7 +746,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, T alpha, lite::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + CHECK_EQ(dim_a.width_, dim_b.height_); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -761,8 +761,8 @@ void Blas::MatMul(const lite::Tensor &mat_a, beta, mat_out->template mutable_data()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); + CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0); this->template BatchedGEMM( transA, transB, diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h index 0c56e0d759fd9b1e3abba5209f43d7a0c8fe194e..72a2f4ce12cbd72b26cd87e97d0178275a4b4abd 100644 --- a/lite/backends/x86/math/context_project.h +++ b/lite/backends/x86/math/context_project.h @@ -146,7 +146,7 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); + CHECK(padding_data != nullptr); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h index 9ff64d53f069d2e4c5b639d273af5b4aa5738b2b..0e721cc8c272eee4b1df1f4b254b5e1d0c1ebb0a 100644 --- a/lite/backends/x86/math/cpu_vec.h +++ b/lite/backends/x86/math/cpu_vec.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #ifdef PADDLE_WITH_MKLML #include "lite/backends/x86/mklml.h" @@ -652,7 +652,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + LOG(FATAL) << "Not support type: " << type; } }; diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 941a34643669f060cdd18f38f92c39e529da7b19..2419620111b7ace292d8a2d366fc1dce2353a15c 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -57,7 +57,7 @@ class CrossEntropyFunctor { for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < num_remain; j++) { int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); + CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); int index = i * num_classes + lbl * num_remain + j; int loss_idx = i * num_remain + j; loss_data[loss_idx] = diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h index 6b66f0b08548c1306681409345c051d1ab40a7c0..d2a66083ac1a72de9e5e469618fc387a5ea784dc 100644 --- a/lite/backends/x86/math/cross_entropy.h +++ b/lite/backends/x86/math/cross_entropy.h @@ -27,7 +27,7 @@ namespace math { template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); + CHECK(static_cast(std::is_floating_point::value)); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h index 6a13a3d471e10970b36120a12b21a36256350803..dc3c3eac1989f256378e408b8e8e4401bea43e7c 100644 --- a/lite/backends/x86/math/detail/activation_functions.h +++ b/lite/backends/x86/math/detail/activation_functions.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) { return ActivationType::kIdentity; } LOG(ERROR) << "Not support type " << type; - // PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); return ActivationType(); } diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h index 86b7a91f4127de50aeb5c5fb02122bced0af4188..767e9b9da0e2977f566c793c2fdc71f83ab5b6d4 100644 --- a/lite/backends/x86/math/gru_compute.h +++ b/lite/backends/x86/math/gru_compute.h @@ -13,7 +13,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index b916c912ffc2a4d62b63b98fdce150b353ba087e..abbd9b0e2811913f6aff79561e365d20bffbeae4 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/im2col.h" #include #include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -38,8 +38,8 @@ class Im2ColFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -72,8 +72,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -82,20 +82,20 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -214,8 +214,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -224,16 +224,16 @@ class Col2ImFunctortemplate mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/x86/math/lstm_compute.h b/lite/backends/x86/math/lstm_compute.h index ddb7bea9995ebcca978be97f8295eb07b0e4e17e..b403770cca7248fba10e62708dddfb91f2789488 100644 --- a/lite/backends/x86/math/lstm_compute.h +++ b/lite/backends/x86/math/lstm_compute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index cb1781db2199c1b7a12aaec80b1904f65b23b534..cc4aa5d9fa54c50eb944714c14a5f6b15634a181 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -121,8 +121,8 @@ struct RowwiseAdd { lite::Tensor* output) { const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + CHECK_EQ(vector.numel(), size); + CHECK_EQ(output->dims(), in_dims); const T* input_data = input.data(); const T* vector_data = vector.data(); diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h index 8f629b5f171814f0df8e51e61123c7c0aabf7643..7081ec0053e0b4194730e6f4353e1274d6019bb4 100644 --- a/lite/backends/x86/math/math_function.h +++ b/lite/backends/x86/math/math_function.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" +#include "lite/utils/cp_logging.h" +// #include "lite/tensor_util.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index acfb76759f6fc9fa4122afd2388bc3adf8f5ea22..9bbfebcfb2feb0e3c9d68261240bed18888350c3 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -59,7 +59,7 @@ void ColwiseSum::operator()(const lite::Context& context, lite::TensorLite* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -81,7 +81,7 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -103,8 +103,8 @@ void RowwiseMean::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -124,10 +124,10 @@ class RowwiseMean { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); auto inv_size = 1.0 / size; T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -147,8 +147,8 @@ void RowwiseSum::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -168,10 +168,10 @@ class RowwiseSum { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc index 19122a6169fbbe1729e38389b0006b11190bc206..b3511ca3521634a771965348e754e10bfd72e19f 100644 --- a/lite/backends/x86/math/math_function_test.cc +++ b/lite/backends/x86/math/math_function_test.cc @@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + CHECK_EQ(10, t.data()[i]); } delete ctx; } diff --git a/lite/backends/x86/math/sampler.h 
b/lite/backends/x86/math/sampler.h index efd9e48e5443186b6b735287cc150f99cb42be81..07cca52e1f436c2979a331dd27c2ddc554c0dad8 100644 --- a/lite/backends/x86/math/sampler.h +++ b/lite/backends/x86/math/sampler.h @@ -32,7 +32,7 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + // CHECK_GT(range, 0, "Range should be greater than 0."); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index 03a18587f4a029bcaebe484ca1ab1951e7c3ecad..8e2a81905b871902aa8ec79c9dd718a62c9f6dec 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -31,7 +31,7 @@ struct SelectedRowsAdd { const fluid::SelectedRows& input2, fluid::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + CHECK_EQ(in1_height, input2.height()); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -49,8 +49,8 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size()); auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); @@ -73,15 +73,15 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2.numel() / in1_height); + CHECK_EQ(in1_row_numel, output->numel() / in1_height); SetConstant functor; functor(context, output, 0.0); @@ -113,7 +113,7 @@ struct SelectedRowsAddTo { const int64_t input2_offset, fluid::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -149,7 +149,7 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); } // concat rows std::vector in2_rows; @@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template mutable_data(); @@ -291,12 +291,11 @@ struct 
MergeAdd { if (input->rows().size() == 0) { continue; } - PADDLE_ENFORCE_EQ(input_width, - input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); - PADDLE_ENFORCE_EQ( - input_height, input->height(), "all input should have same height"); + CHECK_EQ(input_width, input->value().dims()[1]) + << "all input should have same " + "dimension except for the first one"; + CHECK_EQ(input_height, input->height()) + << "all input should have same height"; row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -376,13 +375,13 @@ struct UpdateToTensor { lite::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template data(); diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index aa7aeac532e2fa1f90d452924b364be1896ee862..597521b6e7cac49ac91dbddac71af22bb5a8760c 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor { const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); + CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index 796894cb7d18ec4db7b670276bb3d3fc5b1427f8..953576eea4170cca57f10bb977ca9bfecb36ae6d 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/eigen.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - lods[1].size(), - static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(lods.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; const auto& lod = lods[0]; @@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor { const lite::Tensor& batch, lite::Tensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - in_lod[1].size(), - static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(in_lod.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76..3b2f8bfc4f58a4bfcab968a9288eb8d1d817d78d 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor, layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; - PADDLE_ENFORCE_GE( - pad_seq_len, - valid_seq_len, - "The padded sequence length can not be less than its original length."); + CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can " + "not be less than its original " + "length."; int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width @@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor { pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width) + << "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."; // fill padding value T* pad_data = pad_tensor->template mutable_data(); diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 43407014dea0ed0c78ab29da7fb8ebb0e0310566..5512c4aa11fb5dc05283d01b1d6d3da7fb83c064 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + CHECK_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back()) + << "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."; - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size()) + << "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."; } /* diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 2d00ebad61840da5b14fbf12d9255394b2b2df1a..c1ddb030349a7f7f46fd6b98d3f967eb6fdfe48e 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,12 +46,12 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + CHECK_EQ(idx_dims, out_dims); auto starts = input.lod()[0]; const T* in_data = input.data(); @@ -95,10 +95,10 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } auto starts = input.lod()[0]; @@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); + CHECK_GT(og_dims.size(), 1); + 
CHECK_GT(ig_dims.size(), 1); for (size_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + CHECK_EQ(og_dims[i], ig_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + CHECK_EQ(idx_dims, og_dims); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[0]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); + CHECK(in_w == out_w); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); @@ -330,7 +330,7 @@ class SequencePoolFunctor { out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } @@ -389,7 +389,7 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index b91f43a571994bef95650361a6dc62c0465837a7..8bba0f92055dbee5a81bf12ab2fa5cc6592bd60c 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -50,9 +50,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { in_grad.mutable_data(in_dims, context->GetPlace()); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + CHECK_EQ(in_grad.dims().size(), out_grad.dims().size()); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + CHECK_EQ(in_grad.dims()[i], out_grad.dims()[i]); } // call functor diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index c54bb2099edd0a7e6be61cfdff6340734f09116a..bcab1e77c0bef356453bf1ea1f30aabfc9f1dff0 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -55,7 +55,7 @@ void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + CHECK_EQ(edge_set_dims[1], 2); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 119d7294e9ec21e67f09776ad20d04f15b8b81ce..7ff132cbf121172b5bf35966637080d599eaf498 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "lite/backends/x86/math/unpooling.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -41,7 +41,7 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; output_data[index] = input_data[i]; } input_data += input_feasize; @@ -77,7 +77,7 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 91979bb7fdcfe66d84ded3f9797144ddafc8769e..8e8f44be55fc2df342092ad399f00bcc7941908d 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "lite/backends/x86/math/vol2col.h" #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -36,8 +36,8 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* col) const { - PADDLE_ENFORCE(vol.dims().size() == 4); - PADDLE_ENFORCE(col->dims().size() == 7); + CHECK_EQ(vol.dims().size(), 4); + CHECK_EQ(col->dims().size(), 7); int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; @@ -52,27 +52,27 @@ class Vol2ColFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; const T* vol_data = vol.data(); T* col_data = col->template mutable_data(); @@ -122,8 +122,8 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* vol) const { - PADDLE_ENFORCE(vol->dims().size() == 4); - PADDLE_ENFORCE(col.dims().size() == 7); + CHECK_EQ(vol->dims().size(), 4); + CHECK_EQ(col.dims().size(), 7); int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; @@ -138,27 +138,27 @@ class Col2VolFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - 
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; T* vol_data = vol->template mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/xpu/debug.h b/lite/backends/xpu/debug.h new file mode 100644 index 0000000000000000000000000000000000000000..56bafc9c3d3a7772af8fc8afd10fc7efa3415ef7 --- /dev/null +++ b/lite/backends/xpu/debug.h @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/backends/xpu/target_wrapper.h" + +namespace paddle { +namespace lite { +namespace xpu { + +template +void DumpCPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = ptr[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +template +void DumpXPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr cpu_mem(new T[len]); + XPU_CALL(xpu_memcpy( + cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST)); + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = cpu_mem[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += cpu_mem[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 5dcbc1e275cca8c32003cbef74dfb1f6d4caee93..a322418ccde20a34dc6c6ba9b47601a9a658f99c 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -13,18 +13,17 @@ // limitations under the License. 
#include "lite/backends/xpu/target_wrapper.h" -#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { void* TargetWrapperXPU::Malloc(size_t size) { void* ptr{nullptr}; - xpu_malloc(&ptr, size); + XPU_CALL(xpu_malloc(&ptr, size)); return ptr; } -void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } +void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); } void TargetWrapperXPU::MemcpySync(void* dst, const void* src, @@ -32,15 +31,31 @@ void TargetWrapperXPU::MemcpySync(void* dst, IoDirection dir) { switch (dir) { case IoDirection::HtoD: - xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE)); break; case IoDirection::DtoH: - xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST)); break; default: LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); } } +XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size, + bool use_l3) { + void* ptr{nullptr}; + if (use_l3) { + ptr = xdnn::alloc_workspace(GetRawContext(), size); + } else { + ptr = TargetWrapperXPU::Malloc(size); + } + CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3; + return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3)); +} + +std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT +int TargetWrapperXPU::workspace_l3_size_per_thread{0}; +thread_local xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr}; + } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index c42d4139246085d8b9a367b45b60699209d0b668..070184a13088a169fe38f1b8105a0803d9915da1 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -14,13 +14,45 @@ #pragma once -#include "lite/core/target_wrapper.h" +#include // std::unique_ptr +#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free +#include "lite/core/target_wrapper.h" // TargetWrapper +#include "lite/utils/cp_logging.h" // CHECK_EQ + +#define XPU_CALL(func) \ + { \ + auto e = (func); \ + CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \ + } namespace paddle { namespace lite { +// MAX(lod.size()) = 64 +const int XPU_MAX_LOD_SIZE = 64; +// MAX(lod[i + 1] - lod[i]) = 512 +const int XPU_MAX_LOD_SEQ_LEN = 512; + using TargetWrapperXPU = TargetWrapper; +struct XPUScratchPad { + XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {} + + void* addr_{nullptr}; + bool is_l3_{false}; +}; + +struct XPUScratchPadDeleter { + void operator()(XPUScratchPad* sp) const { + if (!sp->is_l3_) { + XPU_CALL(xpu_free(sp->addr_)); + } + delete sp; + } +}; + +using XPUScratchPadGuard = std::unique_ptr; + template <> class TargetWrapper { public: @@ -34,6 +66,40 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); + + static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false); + + static xdnn::Context* GetRawContext() { + if (tls_raw_ctx_ == nullptr) { + tls_raw_ctx_ = xdnn::create_context(); + CHECK(tls_raw_ctx_); + int r = xdnn::set_workspace_l3_size(tls_raw_ctx_, + workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", workspace_l3_size_per_thread = " + << workspace_l3_size_per_thread; + } + } + return tls_raw_ctx_; + } + + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + static void SetDev(int dev_no = 0) { + const char* dev_env = 
getenv("LITE_XPU_DEV"); + if (dev_env) { + dev_no = atoi(dev_env); + } + + XPU_CALL(xpu_set_device(dev_no)); + } + + static std::string multi_encoder_precision; // NOLINT + static int workspace_l3_size_per_thread; + + private: + static thread_local xdnn::Context* tls_raw_ctx_; }; } // namespace lite diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 56a5c9b8f7ea0ed47d21629d7ccf083b4f9fa232..af2bfbe86aaa1b3f145838015a6d6a62090cb3b1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -121,7 +121,7 @@ lite_cc_library(kernel SRCS kernel.cc PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor + cpp_op_desc tensor utils ) add_dependencies(kernel kernel_list_h) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 75971570fb078ce4e39413e5b3df629fe2a7ac3e..53988f063b89ae3e75f4c27cc1d937d12bb6dae5 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..599e8f6c3791ac68474ca27e6c627bd2fc43765a 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/core/arena/framework.h" +#include #include "lite/core/context.h" #include "lite/operators/subgraph_op.h" @@ -22,37 +23,54 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; - if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) { + static const std::set subgraph_op_supported_targets( + {TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)}); + bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != + subgraph_op_supported_targets.end(); +#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + enable_subgraph_op = false; // Use XPU kernel directly if XTCL is disabled. 
+#endif + if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc + auto sub_program_desc = std::make_shared(); int sub_block_idx = 0; - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_desc_; + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_desc_; // Add the block desc into the subgraph op which used to replace the // original op op_desc_.reset(new cpp::OpDesc()); op_desc_->SetType("subgraph"); op_desc_->SetAttr("sub_block", sub_block_idx); - auto in_names = sub_block_op_desc->input_vars(); - auto out_names = sub_block_op_desc->output_vars(); + auto in_names = sub_op_desc->input_vars(); + auto out_names = sub_op_desc->output_vars(); op_desc_->SetInput("Inputs", in_names); op_desc_->SetOutput("Outputs", out_names); - op_desc_->SetAttr>("input_data_names", in_names); + // filter only data op (not const op by persisiable) + std::vector in_data_names; + for (auto name : in_names) { + if (!(inst_scope_->FindTensor(name)->persistable())) { + in_data_names.push_back(name); + } + } + op_desc_->SetAttr>("input_data_names", + in_data_names); op_desc_->SetAttr>("output_data_names", out_names); op = LiteOpRegistry::Global().Create(op_desc().Type()); - static_cast(op.get())->SetSubBlock(sub_block_desc); + static_cast(op.get())->SetProgramDesc( + sub_program_desc); } else { op = LiteOpRegistry::Global().Create(op_desc().Type()); } CHECK(op) << "no op for " << op_desc().Type(); - op->Attach(*op_desc_, inst_scope_); + op->Attach(*op_desc_, inst_scope_.get()); auto kernels = op->CreateKernels({place_}); // filter out the target kernel CHECK(!kernels.empty()) << "No kernel found for place " << place_.DebugString(); - auto it = std::remove_if( + auto it = std::find_if( kernels.begin(), kernels.end(), [&](std::unique_ptr& k) { return k->alias() == alias_; }); @@ -72,53 +90,35 @@ void TestCase::CreateInstruction() { void TestCase::PrepareInputsForInstruction() { for (auto& arg : op_desc().InputArgumentNames()) { for (auto& var : op_desc().Input(arg)) { - std::string kernel_key = instruction_->kernel()->key_with_alias(); - const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( - place_, kernel_key, arg); - - const Type* inst_type = nullptr; - if (param_type->type->IsTensor()) { - inst_type = Type::GetTensorTy(TARGET(kHost)); - } else if (param_type->type->IsTensorList()) { - inst_type = Type::GetTensorListTy(TARGET(kHost)); - } else { - LOG(FATAL) << "unsupported param_type"; - } - - CHECK(scope_->FindVar(var)); - if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor or tensor_array in the instruction's scope, - /// alloc memory and then copy data there. 
- if (param_type->type->IsTensor()) { - const auto* shared_tensor = scope_->FindTensor(var); - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); - } else if (param_type->type->IsTensorList()) { - const auto* shared_tensor_array = - scope_->FindVar(var)->GetMutable>(); - auto* target_tensor_array = - inst_scope_->Var(var)->GetMutable>(); - CHECK(!shared_tensor_array->empty()) - << "shared_tensor_array is empty yet"; - target_tensor_array->resize(shared_tensor_array->size()); - for (size_t i = 0; i < shared_tensor_array->size(); i++) { - target_tensor_array->at(i).Resize( - shared_tensor_array->at(i).dims()); - TargetCopy(param_type->type->target(), - target_tensor_array->at(i).mutable_data( - param_type->type->target(), - shared_tensor_array->at(i).memory_size()), - shared_tensor_array->at(i).raw_data(), - shared_tensor_array->at(i).memory_size()); - } - } else { - LOG(FATAL) << "not support"; + const auto* type = instruction_->kernel()->GetInputDeclType(arg); + CHECK(base_scope_->FindVar(var)); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. + if (type->IsTensor() && + !TargetCompatibleTo(*Type::GetTensorTy(TARGET(kHost)), *type)) { + const auto* base_tensor = base_scope_->FindTensor(var); + auto* inst_tensor = inst_scope_->FindMutableTensor(var); + CHECK(!base_tensor->dims().empty()) + << "The dims of input tensor is empty yet"; + TargetCopy(type->target(), + inst_tensor->mutable_data(type->target(), + base_tensor->memory_size()), + base_tensor->raw_data(), + base_tensor->memory_size()); + } else if (type->IsTensorList() && + !TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)), + *type)) { + const auto* base_tensor_list = base_scope_->FindTensorList(var); + auto* inst_tensor_list = inst_scope_->FindMutableTensorList(var); + CHECK_EQ(base_tensor_list->size(), inst_tensor_list->size()); + for (size_t i = 0; i < base_tensor_list->size(); i++) { + CHECK(!base_tensor_list->at(i).dims().empty()) + << "The dims of input tensor[" << i << "] is empty yet"; + TargetCopy(type->target(), + inst_tensor_list->at(i).mutable_data( + type->target(), base_tensor_list->at(i).memory_size()), + inst_tensor_list->at(i).raw_data(), + inst_tensor_list->at(i).memory_size()); } } } @@ -126,78 +126,88 @@ void TestCase::PrepareInputsForInstruction() { } template -bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error) { - CHECK(a_tensor); - CHECK(b_tensor); + CHECK(inst_tensor); + CHECK(base_tensor); - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + CHECK(ShapeEquals(inst_tensor->dims(), base_tensor->dims())); - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + CHECK(inst_tensor->lod() == base_tensor->lod()) << "lod not match"; // The baseline should output in host devices. 
- CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { + CHECK(base_tensor->target() == TARGET(kHost) || + base_tensor->target() == TARGET(kX86) || + base_tensor->target() == TARGET(kARM)); + const T* inst_data{}; + Tensor inst_host_tensor; + inst_host_tensor.Resize(inst_tensor->dims()); + switch (inst_tensor->target()) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); + inst_data = static_cast(inst_tensor->raw_data()); break; +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + CopySync(inst_host_tensor.mutable_data(), + inst_tensor->raw_data(), + sizeof(T) * inst_tensor->dims().production(), + IoDirection::DtoH); + inst_data = inst_host_tensor.data(); + break; +#endif default: // Before compare, need to copy data from `target` device to host. LOG(FATAL) << "Not supported"; } - CHECK(a_data); + CHECK(inst_data); - const T* b_data = static_cast(b_tensor->raw_data()); + const T* base_data = static_cast(base_tensor->raw_data()); bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { + for (int i = 0; i < inst_tensor->dims().production(); i++) { + EXPECT_NEAR(inst_data[i], base_data[i], abs_error); + if (fabsf(inst_data[i] - base_data[i]) > abs_error) { success = false; } } return success; } -bool TestCase::CheckPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error, PrecisionType precision_type) { PrecisionType precision_type_t = precision_type; if (precision_type == PRECISION(kAny)) { - precision_type_t = b_tensor->precision(); + precision_type_t = base_tensor->precision(); } - CHECK(precision_type_t == b_tensor->precision()) + CHECK(precision_type_t == base_tensor->precision()) << "arg precision type and base tensor precision type are not matched! " "arg precision type is: " << PrecisionToStr(precision_type) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); - CHECK(a_tensor->precision() == b_tensor->precision()) + << PrecisionToStr(base_tensor->precision()); + CHECK(inst_tensor->precision() == base_tensor->precision()) << "real tensor precision type and base tensor precision type are not " "matched! 
real tensor precision type is: " - << PrecisionToStr(a_tensor->precision()) + << PrecisionToStr(inst_tensor->precision()) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); + << PrecisionToStr(base_tensor->precision()); switch (precision_type_t) { case PRECISION(kFloat): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt8): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt32): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt64): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kBool): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); default: LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); return false; @@ -209,24 +219,24 @@ bool TestCase::CheckPrecision(const std::string& var_name, PrecisionType precision_type) { bool success = true; if (inst_scope_->FindVar(var_name)->IsType()) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + auto inst_tensor = inst_scope_->FindTensor(var_name); + auto base_tensor = base_scope_->FindTensor(var_name); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } else if (inst_scope_->FindVar(var_name)->IsType>()) { - auto a_tensor_array = - inst_scope_->FindVar(var_name)->GetMutable>(); - auto b_tensor_array = - base_scope_->FindVar(var_name)->GetMutable>(); - CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (size_t i = 0; i < a_tensor_array->size(); i++) { - Tensor* a_tensor = &(a_tensor_array->at(i)); - Tensor* b_tensor = &(b_tensor_array->at(i)); - if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + auto inst_tensor_list = inst_scope_->FindMutableTensorList(var_name); + auto base_tensor_list = base_scope_->FindMutableTensorList(var_name); + CHECK_EQ(inst_tensor_list->size(), base_tensor_list->size()); + for (size_t i = 0; i < inst_tensor_list->size(); i++) { + Tensor* inst_tensor = &(inst_tensor_list->at(i)); + Tensor* base_tensor = &(base_tensor_list->at(i)); + if (inst_tensor->dims().size() == 0 && base_tensor->dims().size() == 0) { continue; } - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } } else { LOG(FATAL) << "unsupported var type"; @@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name, return success; } -TestCase::~TestCase() { - if (op_desc_->Type() == "subgraph") { - // Release the subblock desc of Subgraph op - auto subgraph_op = const_cast( - static_cast(instruction_->op())); - CHECK(subgraph_op); - auto sub_block_desc = subgraph_op->GetSubBlock(); - if (sub_block_desc) { - delete sub_block_desc; - } - } -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index cf864a32044e3dfd03ecd03327a0db69275ef586..4ccb05428d38c65f8cad36f1702c034cfe62705b 100644 --- 
a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -28,7 +28,7 @@ #include "lite/core/program.h" #include "lite/core/scope.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { @@ -40,13 +40,15 @@ namespace arena { class TestCase { public: explicit TestCase(const Place& place, const std::string& alias) - : place_(place), scope_(new Scope), alias_(alias) { + : place_(place), + alias_(alias), + inst_scope_(new Scope), + base_scope_(new Scope) { ctx_ = ContextScheduler::Global().NewContext(place_.target); } - virtual ~TestCase(); + virtual ~TestCase() {} void Prepare() { - PrepareScopes(); PrepareData(); op_desc_.reset(new cpp::OpDesc); PrepareOpDesc(op_desc_.get()); @@ -91,16 +93,15 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - Scope& scope() { return *scope_; } - - Scope* baseline_scope() { return base_scope_; } - Scope* inst_scope() { return inst_scope_; } + Scope* baseline_scope() { return base_scope_.get(); } + Scope* inst_scope() { return inst_scope_.get(); } protected: // Prepare inputs in scope() for Tester. virtual void PrepareData() = 0; - /// Prepare a tensor in host. The tensors will be created in scope_. + /// Prepare a tensor in host. The tensors will be created both in base_scope_ + /// and inst_scope_. /// Need to specify the targets other than X86 or ARM. template void SetCommonTensor(const std::string& var_name, @@ -108,42 +109,47 @@ class TestCase { const T* data, const LoD& lod = {}, bool is_persistable = false) { - auto* tensor = scope_->NewTensor(var_name); - tensor->Resize(ddim); - auto* d = tensor->mutable_data(); - memcpy(d, data, ddim.production() * sizeof(T)); + // Create and fill a input tensor with the given data for baseline + auto* base_tensor = base_scope_->NewTensor(var_name); + base_tensor->Resize(ddim); + memcpy(base_tensor->mutable_data(), data, ddim.production() * sizeof(T)); // set lod - if (!lod.empty()) *tensor->mutable_lod() = lod; + if (!lod.empty()) *base_tensor->mutable_lod() = lod; // set persistable - tensor->set_persistable(is_persistable); + base_tensor->set_persistable(is_persistable); + + // Create a copy for instruction + auto* inst_tensor = inst_scope_->NewTensor(var_name); + inst_tensor->CopyDataFrom(*base_tensor); } /// Prepare a tensor_array in host. The tensors will be created in scope_. /// Need to specify the targets other than X86 or ARM. 
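  // Editor's note: an illustrative sketch, not part of this patch. It shows how
  // a derived test case might feed inputs through the two-scope helpers above;
  // the op name "foo", the shapes and the values are hypothetical. Each helper
  // fills an identical copy of the data into base_scope_ (read by RunBaseline)
  // and inst_scope_ (read by the instruction under test):
  //
  //   class FooComputeTester : public arena::TestCase {
  //    public:
  //     using arena::TestCase::TestCase;
  //     void PrepareData() override {
  //       std::vector<float> x(6, 1.f);
  //       SetCommonTensor("x", DDim(std::vector<int64_t>{2, 3}), x.data());
  //       SetCommonTensorList<float>(
  //           "xs", {DDim(std::vector<int64_t>{2, 3})}, {x});
  //     }
  //     void PrepareOpDesc(cpp::OpDesc* op_desc) override { /* set type "foo" */ }
  //     void RunBaseline(Scope* scope) override { /* fill expected outputs */ }
  //   };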
template void SetCommonTensorList(const std::string& var_name, - const std::vector& array_tensor_dims, + const std::vector& ddims, const std::vector>& datas, const std::vector& lods = {}) { - CHECK_EQ(array_tensor_dims.size(), datas.size()); + // Create a tensor array for baseline, and a copy for instruction + CHECK_EQ(ddims.size(), datas.size()); if (!lods.empty()) { - CHECK_EQ(array_tensor_dims.size(), lods.size()); + CHECK_EQ(ddims.size(), lods.size()); } - auto* tensor_array = - scope_->Var(var_name)->GetMutable>(); - for (int i = 0; i < array_tensor_dims.size(); i++) { - Tensor tmp; - tmp.Resize(array_tensor_dims[i]); - auto* tmp_data = tmp.mutable_data(); - memcpy(tmp_data, + auto* base_tensor_list = base_scope_->NewTensorList(var_name); + auto* inst_tensor_list = inst_scope_->NewTensorList(var_name); + for (int i = 0; i < ddims.size(); i++) { + Tensor item; + item.Resize(ddims[i]); + memcpy(item.mutable_data(), datas[i].data(), - array_tensor_dims[i].production() * sizeof(T)); + ddims[i].production() * sizeof(T)); if (!lods.empty()) { - tmp.set_lod(lods[i]); + item.set_lod(lods[i]); } - tensor_array->push_back(tmp); + base_tensor_list->push_back(item); + inst_tensor_list->push_back(item); } } @@ -157,11 +163,6 @@ class TestCase { std::unique_ptr ctx_; void CreateInstruction(); - void PrepareScopes() { - inst_scope_ = &scope_->NewScope(); - base_scope_ = &scope_->NewScope(); - } - // Check shape // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { @@ -172,25 +173,23 @@ class TestCase { return true; } - /// Copy the input tensors to target devices needed by the instruction. + // Copy the host tensors to the device tensors if needed by the instruction. void PrepareInputsForInstruction(); // Create output tensors and variables. void PrepareOutputsForInstruction() { for (auto x : op_desc().output_vars()) { - inst_scope_->NewTensor(x); - base_scope_->NewTensor(x); + inst_scope_->Var(x); } } private: Place place_; - std::shared_ptr scope_; std::string alias_; // The workspace for the Instruction. - Scope* inst_scope_{}; + std::shared_ptr inst_scope_; // The workspace for the baseline implementation. 
- Scope* base_scope_{}; + std::shared_ptr base_scope_; std::unique_ptr op_desc_; std::unique_ptr instruction_; }; diff --git a/lite/core/context.cc b/lite/core/context.cc index eb8f90d7fa90d459846b24bc93b5d26cdfc3969a..abb44945ec66e1a89efc1ccb08ec1df370f2e099 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -17,14 +17,19 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_NPU -std::string Context::subgraph_model_cache_dir_{""}; // NOLINT +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +thread_local std::string + Context::subgraph_model_cache_dir_{ + ""}; // NOLINT +thread_local int + Context::huawei_ascend_device_id_{ + 0}; // NOLINT #endif -#ifdef LITE_WITH_XPU -std::string Context::_multi_encoder_precision; // NOLINT -thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; -int Context::_workspace_l3_size_per_thread{0}; +#ifdef LITE_WITH_MLU +int Context::next_queue_id_{0}; +std::map Context::queue_id_map_; +std::mutex Context::map_mutex_; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index f606eeffaf8ccf932e2d17f03478d4d893ee482d..69f6a4b9d6bc87422d06e66e8d329547ccf5f24a 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -25,6 +25,7 @@ #ifdef LITE_WITH_MLU #include #include +#include // NOLINT #include "lite/backends/mlu/mlu_utils.h" #endif #ifdef LITE_WITH_XPU @@ -38,6 +39,7 @@ #include #include #include "lite/core/device_info.h" +#include "lite/core/scope.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" @@ -60,6 +62,7 @@ using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; using RKNPUContext = Context; +using HuaweiAscendNPUContext = Context; template <> class Context { @@ -83,6 +86,35 @@ class Context { NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } + static void SetSubgraphModelCacheDir(Scope* scope, + std::string subgraph_model_cache_dir) { + auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR"); + CHECK(var); + auto data = var->GetMutable(); + CHECK(data); + *data = subgraph_model_cache_dir; + } + static std::string SubgraphModelCacheDir(Scope* scope) { + auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR"); + if (!var) return ""; + return var->Get(); + } +}; +#endif + +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +template <> +class Context { + public: + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(HuaweiAscendNPUContext* ctx) {} + + HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) { + return *this; + } + std::string name() const { return "HuaweiAscendNPUContext"; } + static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) { subgraph_model_cache_dir_ = subgraph_model_cache_dir; } @@ -90,8 +122,14 @@ class Context { return subgraph_model_cache_dir_; } + static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) { + huawei_ascend_device_id_ = huawei_ascend_device_id; + } + static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; } + private: - static std::string subgraph_model_cache_dir_; + static thread_local std::string subgraph_model_cache_dir_; + static thread_local int huawei_ascend_device_id_; }; #endif @@ -143,45 +181,12 @@ class Context { void CopySharedTo(XPUContext* ctx) {} + // TODO(miaotianxiang): remove this static xdnn::Context* GetRawContext() { - if (_tls_raw_ctx == nullptr) { - _tls_raw_ctx = xdnn::create_context(); - CHECK(_tls_raw_ctx); - int r = 
xdnn::set_workspace_l3_size(_tls_raw_ctx, - _workspace_l3_size_per_thread); - if (r != 0) { - LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r - << ", _workspace_l3_size_per_thread = " - << _workspace_l3_size_per_thread; - } - } - return _tls_raw_ctx; - } - - static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - _workspace_l3_size_per_thread = l3_size; - } - - // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker - // thread - static void SetDev(int dev_no = 0) { - const char* dev_env = getenv("LITE_XPU_DEV"); - if (dev_env) { - xpu_set_device(atoi(dev_env)); - return; - } - - xpu_set_device(dev_no); + return TargetWrapperXPU::GetRawContext(); } std::string name() const { return "XPUContext"; } - - public: - static std::string _multi_encoder_precision; // NOLINT - - private: - static thread_local xdnn::Context* _tls_raw_ctx; - static int _workspace_l3_size_per_thread; }; #endif @@ -249,11 +254,11 @@ class Context { void InitOnce() {} MLUContext& operator=(const MLUContext& ctx) { - this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + this->Init(ctx.device_id_, ctx.exec_queue_id_); return *this; } - void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + void Init(int dev_id, int exec_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { @@ -264,21 +269,19 @@ class Context { device_id_ = dev_id; } SetMluDevice(device_id_); - if (io_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "data queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - io_queue_id = 0; - } - if (exec_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "exec queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - exec_queue_id = 0; + + // get queue id from map + std::unique_lock lk(map_mutex_); + if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) { + queue_id_map_[exec_queue_id] = + next_queue_id_++ % devs[dev_id].max_queue(); } - io_queue_ = devs[dev_id].io_queues()[io_queue_id]; - exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id]; + exec_queue_id_ = queue_id_map_[exec_queue_id]; + VLOG(4) << "pick mlu queue id: " << exec_queue_id_; + lk.unlock(); - exec_queue_id_ = exec_queue_id; - io_queue_id_ = io_queue_id; + io_queue_ = devs[dev_id].io_queues()[exec_queue_id_]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_]; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } @@ -290,10 +293,12 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return DeviceInfo::Global().MLUCoreVersion(); + return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); } - int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + int MLUCoreNumber() { + return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); + } u32_t affinity() { return affinity_; } @@ -304,10 +309,12 @@ class Context { std::string name() const { return "MLUContext"; } private: + static int next_queue_id_; + static std::map queue_id_map_; + static std::mutex map_mutex_; int device_id_; // overall information int exec_queue_id_; - int io_queue_id_; cnrtQueue_t io_queue_; cnrtQueue_t exec_queue_; @@ -415,6 +422,13 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + case TARGET(kHuaweiAscendNPU): + kernel_contexts_[TargetType::kHuaweiAscendNPU] + .As() + .CopySharedTo(&ctx->As()); + break; 
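      // Editor's note: an illustrative sketch, not part of this patch. Since
      // the subgraph model cache dir and device id introduced above are
      // thread_local statics of HuaweiAscendNPUContext, each worker thread
      // would typically set its own values before asking the scheduler for a
      // context (the values below are hypothetical):
      //
      //   HuaweiAscendNPUContext::SetHuaweiAscendDeviceID(0);
      //   HuaweiAscendNPUContext::SetSubgraphModelCacheDir("/data/local/tmp/cache");
      //   auto ctx = ContextScheduler::Global().NewContext(TARGET(kHuaweiAscendNPU));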
+#endif #ifdef LITE_WITH_APU case TARGET(kAPU): kernel_contexts_[TargetType::kAPU].As().CopySharedTo( @@ -455,7 +469,7 @@ class ContextScheduler { case TARGET(kMLU): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kMLU].As().CopySharedTo( &context); LOG(INFO) << "New Context for MLU"; @@ -496,6 +510,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + InitContext(); +#endif #ifdef LITE_WITH_APU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index ac79ede37406188f495690179b4a4886bc009d80..6d404cee9718a94d2646728c8f2d79576ceb7860 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,15 +66,6 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; -#ifdef LITE_WITH_MLU -thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; -thread_local int DeviceInfo::mlu_core_number_{1}; -thread_local bool DeviceInfo::use_first_conv_{false}; -thread_local std::vector DeviceInfo::mean_vec_; -thread_local std::vector DeviceInfo::std_vec_; -thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; -#endif - #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() { return 0; } -#ifdef LITE_WITH_MLU -void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout) { - switch (core_version) { - case (lite_api::MLUCoreVersion::MLU_220): - mlu_core_version_ = CNML_MLU220; - break; - case (lite_api::MLUCoreVersion::MLU_270): - mlu_core_version_ = CNML_MLU270; - break; - default: - mlu_core_version_ = CNML_MLU270; - break; - } - mlu_core_number_ = core_number; - use_first_conv_ = use_first_conv; - mean_vec_ = mean_vec; - std_vec_ = std_vec; - input_layout_ = input_layout; -} - -cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } - -int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } - -bool DeviceInfo::UseFirstConv() { return use_first_conv_; } - -const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } - -const std::vector& DeviceInfo::StdVec() const { return std_vec_; } - -DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } - -#endif // LITE_WITH_MLU - void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index f5b75039ea14f67cee9d009261b2dd1fc6b46825..f3f10c2d5740d6e8cc7e219b8f0d9d9ff17a8496 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -55,20 +55,6 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); -#ifdef LITE_WITH_MLU - void SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout); - cnmlCoreVersion_t MLUCoreVersion(); - int MLUCoreNumber(); - bool UseFirstConv(); - const std::vector& MeanVec() const; - const std::vector& StdVec() const; - DataLayoutType InputLayout() const; -#endif void SetCache(int l1size, int 
l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -120,15 +106,6 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; -#ifdef LITE_WITH_MLU - static thread_local cnmlCoreVersion_t mlu_core_version_; - static thread_local int mlu_core_number_; - static thread_local bool use_first_conv_; - static thread_local std::vector mean_vec_; - static thread_local std::vector std_vec_; - static thread_local DataLayoutType input_layout_; -#endif - void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::HtoD); break; #endif +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kBM): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + TargetWrapperXPU::MemcpySync(dst, src, size, dir); + break; #endif default: LOG(FATAL) diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index b8234b18922f454c41e295209da13de024184adc..cd129b332fa79dc45d74dc8a0befc1e67a68c316 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -18,17 +18,22 @@ lite_cc_library(mir_passes fusion/conv_activation_fuse_pass.cc fusion/var_conv_2d_activation_fuse_pass.cc fusion/conv_bn_fuse_pass.cc + fusion/conv_conv_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__resnet_cbam_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc fusion/__xpu__fc_fuse_pass.cc + fusion/__xpu__mmdnn_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc + elimination/remove_tf_redundant_ops_pass.cc + elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc type_target_cast_pass.cc diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..7866cb956c4e51d3b69687751325ca3ff4eda9d6 --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+// Remove all of the unused nodes from the control flow op and update the
+// inputs and outputs of the op info. The unused nodes are defined as the nodes
+// which are only linked to the control flow op nodes but never linked to the
+// other op nodes.
+//
+// For example:
+// graph[0]: main block
+//                   in_x
+//          in_f      |     in_z(unused node)
+//              \     |    /
+//               \    |   /
+//   in_w ------- while ------- in_y(unused node)
+//               /    |
+//              /     |
+// (unused node)out_y |
+//                   out_x
+//
+// graph[1]: sub block
+//                   in_x
+//                    |
+//                    |
+//                  conv2d----in_f
+//                    |
+//                    |
+//                   fc ------in_w
+//                    |
+//                    |
+//                 softmax
+//                    |
+//                    |
+//                   out_x
+//
+// After the pass is applied:
+//                   in_x
+//          in_f      |
+//              \     |
+//               \    |
+//   in_w ------- while
+//                    |
+//                    |
+//                    |
+//                   out_x
+
+// Remove the var node from var2rm if it is recursively referred to by any op
+// in the subblock
+void CollectUnusedInputOutputNodes(
+    int block_idx,
+    std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
+    const std::unordered_set<std::string>& control_flow_op_types,
+    std::unordered_map<std::string, Node*>* in_vars2rm,
+    std::unordered_map<std::string, Node*>* out_vars2rm) {
+  auto block_size = graphs->size();
+  for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
+    if (!op_node->IsStmt()) continue;
+    auto op_info = op_node->AsStmt().op_info();
+    auto op_type = op_info->Type();
+    if (control_flow_op_types.count(op_type)) {
+      int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
+      CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
+      CollectUnusedInputOutputNodes(sub_block_idx,
+                                    graphs,
+                                    control_flow_op_types,
+                                    in_vars2rm,
+                                    out_vars2rm);
+    } else {
+      for (auto& var_node : op_node->inlinks) {
+        auto& var_name = var_node->AsArg().name;
+        if (in_vars2rm->count(var_name)) {
+          in_vars2rm->erase(var_name);
+        }
+      }
+      for (auto& var_node : op_node->outlinks) {
+        auto& var_name = var_node->AsArg().name;
+        // Tensor array may be only used as the output vars in the subblock
+        if (in_vars2rm->count(var_name)) {
+          in_vars2rm->erase(var_name);
+        }
+        if (out_vars2rm->count(var_name)) {
+          out_vars2rm->erase(var_name);
+        }
+      }
+    }
+  }
+}
+
+// Remove the unused var nodes from the graph and update the op_info of the
+// control flow op
+void RemoveNodesFromGraphAndUpdateOpInfo(
+    SSAGraph* graph,
+    Node* op_node,
+    const std::unordered_map<std::string, Node*>& in_vars2rm,
+    const std::unordered_map<std::string, Node*>& out_vars2rm) {
+  auto op_info = op_node->AsStmt().mutable_op_info();
+  auto op_type = op_info->Type();
+  // Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
+  // remove them if never used.
+  for (auto& var_node : in_vars2rm) {
+    VLOG(3) << "in var node '" << var_node.first << "' is unlinked from "
+            << op_type;
+    RemoveDirectedLink(var_node.second, op_node);
+  }
+  for (auto& var_node : out_vars2rm) {
+    VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
+            << op_type;
+    RemoveDirectedLink(op_node, var_node.second);
+    // Unlink from all of the out op nodes.
+    std::unordered_set<Node*> out_op_nodes;
+    for (auto* out_op_node : var_node.second->outlinks) {
+      if (!out_op_nodes.count(out_op_node)) {
+        out_op_nodes.insert(out_op_node);
+      }
+    }
+    for (auto* out_op_node : out_op_nodes) {
+      RemoveDirectedLink(var_node.second, out_op_node);
+    }
+  }
+  // Remove the unused nodes from the graph if their inlinks and outlinks are
+  // empty
+  std::unordered_set<Node*> removed_var_nodes;
+  for (auto& var_node : in_vars2rm) {
+    if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
+        !removed_var_nodes.count(var_node.second)) {
+      removed_var_nodes.insert(var_node.second);
+      graph->RemoveNode(var_node.second);
+      VLOG(3) << "in var node " << var_node.first << " is removed";
+    }
+  }
+  for (auto& var_node : out_vars2rm) {
+    if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
+        !removed_var_nodes.count(var_node.second)) {
+      removed_var_nodes.insert(var_node.second);
+      graph->RemoveNode(var_node.second);
+      VLOG(3) << "out var node " << var_node.first << " is removed";
+    }
+  }
+  // Update the op info of the control flow op
+  for (auto& input : *op_info->mutable_inputs()) {
+    for (auto var = input.second.begin(); var != input.second.end();) {
+      if (in_vars2rm.count(*var)) {
+        var = input.second.erase(var);
+      } else {
+        ++var;
+      }
+    }
+  }
+  for (auto& output : *op_info->mutable_outputs()) {
+    for (auto var = output.second.begin(); var != output.second.end();) {
+      if (out_vars2rm.count(*var)) {
+        var = output.second.erase(var);
+      } else {
+        ++var;
+      }
+    }
+  }
+}
+
+void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
+    std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
+  CHECK(graphs && !graphs->empty());
+  graphs_ = graphs;
+}
+
+void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
+    const std::unique_ptr<SSAGraph>& graph) {
+  // Remove the unused input and output nodes from the control flow op nodes,
+  // which are only linked to the control flow op nodes but never linked to
+  // the other op nodes
+  const std::unordered_set<std::string> control_flow_op_types = {
+      "while", "conditional_block"};
+  auto block_size = graphs_->size();
+  for (auto& op_node : graph->StmtTopologicalOrder()) {
+    if (!op_node->IsStmt()) continue;
+    auto op_info = op_node->AsStmt().mutable_op_info();
+    auto op_type = op_info->Type();
+    if (!control_flow_op_types.count(op_type)) continue;
+    int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
+    CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
+    // Initialize the unused nodes with all of the input and output nodes
+    std::unordered_map<std::string, Node*> in_vars2rm, out_vars2rm;
+    for (auto* var_node : op_node->inlinks) {
+      auto& var_name = var_node->AsArg().name;
+      if (!in_vars2rm.count(var_name)) {
+        in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
+      }
+    }
+    for (auto* var_node : op_node->outlinks) {
+      auto& var_name = var_node->AsArg().name;
+      if (!out_vars2rm.count(var_name)) {
+        out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
+      }
+    }
+    // Remove the nodes which are used in the subblock recursively; the
+    // remaining nodes are the unused ones.
+ CollectUnusedInputOutputNodes(sub_block_idx, + graphs_, + control_flow_op_types, + &in_vars2rm, + &out_vars2rm); + if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) { + // Remove the unused nodes from graph, and update the op info of the + // control flow op + RemoveNodesFromGraphAndUpdateOpInfo( + graph.get(), op_node, in_vars2rm, out_vars2rm); + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS( + control_flow_op_unused_inputs_and_outputs_eliminate_pass, + paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass) + .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2863661de1e93d15bfe835e39033d4ecaee6d8cc --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr &graph) override; + void SetAllGraphs(std::vector> *graphs); + + private: + std::vector> *graphs_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..673854b118a8adaca73cb905eda4892b6903665c --- /dev/null +++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc @@ -0,0 +1,245 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
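// Editor's note: an illustrative sketch, not part of this patch, of how the
// control_flow_op_unused_inputs_and_outputs_eliminate_pass registered above
// might be driven. The pass needs the SSA graphs of every block via
// SetAllGraphs() before Apply() runs on the main graph; the lookup pattern
// follows the PassManager usage elsewhere in the optimizer, but this driver is
// a simplified assumption rather than the actual optimizer code:
//
//   std::vector<std::unique_ptr<mir::SSAGraph>> graphs;  // one graph per block
//   auto* pass =
//       mir::PassManager::Global()
//           .LookUp<mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass>(
//               "control_flow_op_unused_inputs_and_outputs_eliminate_pass");
//   pass->SetAllGraphs(&graphs);
//   pass->Apply(graphs[0]);  // graphs[0] holds the main block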
+ +#include "lite/core/mir/elimination/remove_tf_redundant_ops_pass.h" +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" +#include "lite/model_parser/cpp_desc.h" + +namespace paddle { +namespace lite { +namespace mir { + +void RemoveTFRedundantOpsPass::Apply(const std::unique_ptr& graph) { + RemoveSqueeze2Reshape2Pattern(graph); + RemoveReshape2Pattern(graph); +} + +void RemoveTFRedundantOpsPass::RemoveReshape2Pattern( + const std::unique_ptr& graph) { + bool found = false; + Node* softmax_node{nullptr}; + Node* reshape2_node{nullptr}; + std::string reshape2_out_arg_name; + Node* fetch_node{nullptr}; + std::string fetch_in_arg_name; + DDim softmax_out_dims; + DDim reshape2_out_dims; + + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->AsStmt().picked_kernel().op_type() == "softmax") { + softmax_node = op_node; + } else if (op_node->AsStmt().picked_kernel().op_type() == "reshape2") { + reshape2_node = op_node; + } else if (op_node->AsStmt().picked_kernel().op_type() == "fetch") { + fetch_node = op_node; + fetch_in_arg_name = fetch_node->inlinks.front()->AsArg().name; + } + } + + if (softmax_node == nullptr || reshape2_node == nullptr) { + return; + } + + // Get out tensor dims of softmax, reshape2 + auto* scope = softmax_node->AsStmt().op()->scope(); + auto softmax_out_arg_name = softmax_node->outlinks.front()->AsArg().name; + auto softmax_out_tensor = + scope->FindVar(softmax_out_arg_name)->Get(); + softmax_out_dims = softmax_out_tensor.dims(); + + for (auto out_node : reshape2_node->outlinks) { + if (out_node->IsArg() && out_node->outlinks.size() != 0) { + reshape2_out_arg_name = reshape2_node->outlinks.front()->AsArg().name; + auto reshape2_out_tensor = + scope->FindVar(reshape2_out_arg_name)->Get(); + reshape2_out_dims = reshape2_out_tensor.dims(); + } + } + + VLOG(3) << "reshape2_out_dims:" << reshape2_out_dims; + VLOG(3) << "softmax_out_dims:" << softmax_out_dims; + VLOG(3) << "found:" << found; + + if (softmax_out_dims == reshape2_out_dims && + softmax_node->outlinks.front() == reshape2_node->inlinks.front() && + reshape2_out_arg_name == fetch_in_arg_name) { + found = true; + } + + if (found) { + // link out_arg to op + IR_NODE_LINK_TO(softmax_node->outlinks.front(), fetch_node); + + // collect nodes to safe remove + std::set nodes_to_remove; + auto remove_inst_node_and_out_args_node = [&](Node* n) { + nodes_to_remove.insert(n); + for (auto& out : n->outlinks) { + nodes_to_remove.insert(out); + } + }; + + remove_inst_node_and_out_args_node(reshape2_node); + GraphSafeRemoveNodes(graph.get(), nodes_to_remove); + auto fetch_op_desc = fetch_node->AsStmt().mutable_op_info(); + fetch_op_desc->SetInput("X", + {softmax_node->outlinks.front()->AsArg().name}); + } + VLOG(5) << "\n" << Visualize(graph.get()); +} + +void RemoveTFRedundantOpsPass::RemoveSqueeze2Reshape2Pattern( + const std::unique_ptr& graph) { + VLOG(5) << Visualize(graph.get()); + bool found = false; + + // find out_arg->squeeze2 + // find out_arg_dims of out_arg + Node* out_arg_node{nullptr}; + DDim out_arg_dims; + Node* squeeze2_node{nullptr}; + + // find squeeze2->reshape2 + // find output dims of squeeze2 and reshape2 nodes + DDim squeeze2_out_dims; + Node* reshape2_node{nullptr}; + Node* reshape2_out_node{nullptr}; + DDim reshape2_out_dims; + + // find next inst node of reshape2 + Node* next_inst_node_of_reshape2_out{nullptr}; + + for (auto& node : 
graph->StmtTopologicalOrder()) { + if (node->AsStmt().picked_kernel().op_type() != "squeeze2") continue; + auto* scope = node->AsStmt().op()->scope(); + + // find inlinks of squeeze2: out_arg_node + squeeze2_node = node; + auto squeeze2_inlinks = squeeze2_node->inlinks; + VLOG(5) << "squeeze2_inlinks.size():" << squeeze2_inlinks.size(); + for (auto& in_link : squeeze2_inlinks) { + if (in_link->IsArg() && squeeze2_inlinks.size() == 1) { + out_arg_node = in_link; + auto* var = scope->FindVar(out_arg_node->AsArg().name); + out_arg_dims = var->Get().dims(); + VLOG(5) << "arg name:" << out_arg_node->AsArg().name + << " dims:" << out_arg_dims; + } else { + // found mutli-input links + continue; + } + } + + // find squeeze2->reshape2 pattern + // and output dims of squeeze2, reshape2 nodes + auto squeeze2_outlinks = squeeze2_node->outlinks; + for (auto& squeeze2_out_link : squeeze2_outlinks) { + if (squeeze2_out_link->IsArg() && + squeeze2_out_link->outlinks.size() != 0) { + auto* squeeze2_out_var = + scope->FindVar(squeeze2_out_link->AsArg().name); + squeeze2_out_dims = squeeze2_out_var->Get().dims(); + + VLOG(5) << "squeeze2_out_arg.name:" << squeeze2_out_link->AsArg().name + << " squeeze2_out_dims:" << squeeze2_out_dims + << " squeeze2_out_link->outlinks.size():" + << squeeze2_out_link->outlinks.size(); + + for (auto& out2_link : squeeze2_out_link->outlinks) { + if (out2_link->IsStmt() && + out2_link->AsStmt().picked_kernel().op_type() == "reshape2") { + reshape2_node = out2_link; + for (auto& reshape2_out_link : reshape2_node->outlinks) { + if (reshape2_out_link->IsArg() && + reshape2_out_link->outlinks.size() != 0) { + reshape2_out_node = reshape2_out_link; + auto* reshape2_out_var = + scope->FindVar(reshape2_out_link->AsArg().name); + reshape2_out_dims = + reshape2_out_var->Get().dims(); + + VLOG(5) << "reshape2_out_node:" << reshape2_out_node + << " reshape2_out_name:" + << reshape2_out_link->AsArg().name + << " reshape2_out_dims:" << reshape2_out_dims; + } + } + } + } + } + } + + // find next inst node of reshape2 + VLOG(5) << "reshape2_out_node->outlinks.size():" + << reshape2_out_node->outlinks.size() + << " reshape2_out_node->IsStmt():" << reshape2_out_node->IsStmt(); + VLOG(5) << "reshape2_out_node->AsArg().name:" + << reshape2_out_node->AsArg().name; + if (reshape2_out_node != nullptr && + reshape2_out_node->outlinks.size() == 1 && + reshape2_out_node->outlinks.front()->IsStmt()) { + next_inst_node_of_reshape2_out = reshape2_out_node->outlinks.front(); + found = true; + break; + VLOG(5) + << "next_inst_node_of_reshape2_out->picked_kernel().op_type():" + << next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type(); + } + + VLOG(5) << "=============================="; + VLOG(5) << "out_arg_dims:" << out_arg_dims; + VLOG(5) << "squeeze2_out_dims:" << squeeze2_out_dims; + VLOG(5) << "reshape2_out_dims:" << reshape2_out_dims; + VLOG(5) << "=============================="; + } + + // replace pattern + if (found && out_arg_dims[1] == squeeze2_out_dims[1] && + out_arg_dims[1] == reshape2_out_dims[1] && out_arg_dims[1] == 1001 && + out_arg_dims[2] == out_arg_dims[3] && out_arg_dims[2] == 1 && + next_inst_node_of_reshape2_out->AsStmt().picked_kernel().op_type() == + "softmax") { + // link out_arg to op + IR_NODE_LINK_TO(out_arg_node, next_inst_node_of_reshape2_out); + + // collect nodes to safe remove + std::set nodes_to_remove; + auto remove_inst_node_and_out_args_node = [&](Node* n) { + nodes_to_remove.insert(n); + for (auto& out : n->outlinks) { + nodes_to_remove.insert(out); + } + 
}; + remove_inst_node_and_out_args_node(squeeze2_node); + remove_inst_node_and_out_args_node(reshape2_node); + GraphSafeRemoveNodes(graph.get(), nodes_to_remove); + auto next_inst_op_desc = + next_inst_node_of_reshape2_out->AsStmt().mutable_op_info(); + next_inst_op_desc->SetInput("X", {out_arg_node->AsArg().name}); + VLOG(5) << Visualize(graph.get()); + } + VLOG(5) << "replace pattern fininshed"; +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(remove_tf_redundant_ops_pass, + paddle::lite::mir::RemoveTFRedundantOpsPass) + .BindTargets({TARGET(kOpenCL), TARGET(kARM)}); diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..652a8fb4a7f67e173527725e3bbecfadcde96798 --- /dev/null +++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/tensor.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * mir::RemoveTFRedundantOpsPass remove reshape2->squeeze2 pattern + * and last reshape2 op for tensorflow mobilenetv1/v2. + */ +class RemoveTFRedundantOpsPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + void RemoveReshape2Pattern(const std::unique_ptr& graph); + void RemoveSqueeze2Reshape2Pattern(const std::unique_ptr& graph); +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571..95723bbd21dc02ed8bb5b46c48f9836d3f9aff1f 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation lite_cc_library(fuse_conv_bn SRCS conv_bn_fuser.cc DEPS pattern_matcher_high_api) +lite_cc_library(fuse_conv_conv + SRCS conv_conv_fuser.cc + DEPS pattern_matcher_high_api) lite_cc_library(fuse_elementwise_add_activation SRCS elementwise_add_activation_fuser.cc DEPS pattern_matcher_high_api) @@ -42,6 +45,7 @@ set(mir_fusers fuse_conv_activation fuse_var_conv_activation fuse_conv_bn + fuse_conv_conv fuse_quant_dequant fuse_elementwise_add_activation fuse_transpose_softmax_transpose diff --git a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..db950fd4b4d671ed618c8bc53010e5be6f5fd78b --- /dev/null +++ b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc @@ -0,0 +1,1644 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUMmdnnFloat2Fix { + public: + void operator()(SSAGraph* graph) { + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + auto* op_info = node->stmt()->op_info(); + std::string op_type = op_info->Type(); + + static const std::vector target_ops{"var_conv_2d", + "search_fc"}; + if (std::find(target_ops.begin(), target_ops.end(), op_type) != + target_ops.end()) { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + memcpy( + weight_on_host, weight_int16.get(), weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "match_matrix_tensor") { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1] * weight_dims[2]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix && Transposed, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "search_grnn") { + auto* scope = node->stmt()->op()->scope(); + + std::string wi_name = op_info->Input("Wi").front(); + auto* wi_t = scope->FindMutableTensor(wi_name); + auto wi_dims = wi_t->dims(); + auto wi_len = wi_t->numel(); + auto wi_stride_len = 
wi_len / 3; + float* wi_on_host = wi_t->mutable_data(); + std::unique_ptr wi_int16(new int16_t[wi_len]); + std::vector wi_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wi_on_host + i * wi_stride_len, wi_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wi_on_host + i * wi_stride_len, + wi_int16.get() + i * wi_stride_len, + max_f, + wi_stride_len); + wi_max[i] = max_f; + } + memcpy(wi_on_host, wi_int16.get(), wi_len * sizeof(int16_t)); + + std::string wh_name = op_info->Input("Wh").front(); + auto* wh_t = scope->FindMutableTensor(wh_name); + auto wh_dims = wh_t->dims(); + auto wh_len = wh_t->numel(); + auto wh_stride_len = wh_len / 3; + float* wh_on_host = wh_t->mutable_data(); + std::unique_ptr wh_int16(new int16_t[wh_len]); + std::vector wh_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wh_on_host + i * wh_stride_len, wh_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wh_on_host + i * wh_stride_len, + wh_int16.get() + i * wh_stride_len, + max_f, + wh_stride_len); + wh_max[i] = max_f; + } + memcpy(wh_on_host, wh_int16.get(), wh_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr>("__xpu__wi_max", wi_max); + update_op_info.SetAttr>("__xpu__wh_max", wh_max); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type << ", wi_name=" << wi_name + << ", wh_name=" << wh_name; + } + } + } +}; + +class XPUMmdnnSearchAttentionFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input = VarNode("input")->AsInput(); + + auto* search_group_padding = + OpNode("search_group_padding", "search_group_padding"); + auto* out_emb_padding = + VarNode("out_emb_padding") + ->assert_is_op_output("search_group_padding", "Out_emb_padding") + ->AsIntermediate(); + auto* out_new = VarNode("out_new") + ->assert_is_op_output("search_group_padding", "Out_new") + ->AsIntermediate(); + auto* out_padding = + VarNode("out_padding") + ->assert_is_op_output("search_group_padding", "Out_padding") + ->AsIntermediate(); + + auto* search_seq_fc_w = VarNode("search_seq_fc_w") + ->assert_is_op_input("search_seq_fc", "W") + ->AsInput(); + auto* search_seq_fc_b = VarNode("search_seq_fc_b") + ->assert_is_op_input("search_seq_fc", "b") + ->AsInput(); + auto* search_seq_fc = + OpNode("search_seq_fc", "search_seq_fc")->AsIntermediate(); + auto* search_seq_fc_out = VarNode("search_seq_fc_out") + ->assert_is_op_output("search_seq_fc", "Out") + ->AsIntermediate(); + + auto* search_aligned_mat_mul = + OpNode("search_aligned_mat_mul", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_out = + VarNode("search_aligned_mat_mul_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_a = + VarNode("search_aligned_mat_mul_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_b = + VarNode("search_aligned_mat_mul_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_c = + VarNode("search_aligned_mat_mul_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_attention_padding_mask = + OpNode("search_attention_padding_mask", "search_attention_padding_mask") + ->AsIntermediate(); + auto* 
search_attention_padding_mask_out = + VarNode("search_attention_padding_mask_out") + ->assert_is_op_output("search_attention_padding_mask", "Out") + ->AsIntermediate(); + auto* search_attention_padding_mask_pad_begin = + VarNode("search_attention_padding_mask_pad_begin") + ->assert_is_op_output("search_attention_padding_mask", "pad_begin") + ->AsIntermediate(); + + auto* search_seq_softmax = + OpNode("search_seq_softmax", "search_seq_softmax")->AsIntermediate(); + auto* search_seq_softmax_out = + VarNode("search_seq_softmax_out") + ->assert_is_op_output("search_seq_softmax", "Out") + ->AsIntermediate(); + auto* search_seq_softmax_out_log = + VarNode("search_seq_softmax_out_log") + ->assert_is_op_output("search_seq_softmax", "Out_log") + ->AsIntermediate(); + + auto* search_aligned_mat_mul_2 = + OpNode("search_aligned_mat_mul_2", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_out = + VarNode("search_aligned_mat_mul_2_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_a = + VarNode("search_aligned_mat_mul_2_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_b = + VarNode("search_aligned_mat_mul_2_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_c = + VarNode("search_aligned_mat_mul_2_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_seq_depadding = + OpNode("search_seq_depadding")->AsIntermediate(); + auto* search_seq_depadding_out = + VarNode("search_seq_depadding_out")->AsOutput(); + + *input >> *search_group_padding >> *out_emb_padding; + *search_group_padding >> *out_new; + *search_group_padding >> *out_padding; + + *search_seq_fc_w >> *search_seq_fc; + *search_seq_fc_b >> *search_seq_fc; + *out_emb_padding >> *search_seq_fc; + *search_seq_fc >> *search_seq_fc_out; + + *search_seq_fc_out >> *search_aligned_mat_mul; + *out_emb_padding >> *search_aligned_mat_mul; + *search_aligned_mat_mul >> *search_aligned_mat_mul_out; + *search_aligned_mat_mul >> *search_aligned_mat_mul_a; + *search_aligned_mat_mul >> *search_aligned_mat_mul_b; + *search_aligned_mat_mul >> *search_aligned_mat_mul_c; + + *search_aligned_mat_mul_out >> *search_attention_padding_mask; + *out_padding >> *search_attention_padding_mask; + *search_attention_padding_mask >> *search_attention_padding_mask_out; + *search_attention_padding_mask >> *search_attention_padding_mask_pad_begin; + + *search_attention_padding_mask_out >> *search_seq_softmax; + *search_seq_softmax >> *search_seq_softmax_out; + *search_seq_softmax >> *search_seq_softmax_out_log; + + *search_seq_softmax_out >> *search_aligned_mat_mul_2; + *out_emb_padding >> *search_aligned_mat_mul_2; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_out; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_a; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_b; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_c; + + *search_aligned_mat_mul_2_out >> *search_seq_depadding; + *out_new >> *search_seq_depadding; + *search_seq_depadding >> *search_seq_depadding_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_search_attention"); + op_desc.SetInput("X", {matched.at("input")->arg()->name}); + op_desc.SetInput("W", {matched.at("search_seq_fc_w")->arg()->name}); + 
op_desc.SetInput("b", {matched.at("search_seq_fc_b")->arg()->name}); + op_desc.SetOutput("Out", + {matched.at("search_seq_depadding_out")->arg()->name}); + + auto* padding_op_info = + matched.at("search_group_padding")->stmt()->op_info(); + op_desc.SetAttr("pad_id", padding_op_info->GetAttr("pad_id")); + auto* matmul_0_op_info = + matched.at("search_aligned_mat_mul")->stmt()->op_info(); + op_desc.SetAttr("alpha0", matmul_0_op_info->GetAttr("alpha")); + auto* matmul_1_op_info = + matched.at("search_aligned_mat_mul_2")->stmt()->op_info(); + op_desc.SetAttr("alpha1", matmul_1_op_info->GetAttr("alpha")); + auto* mask_op_info = + matched.at("search_attention_padding_mask")->stmt()->op_info(); + op_desc.SetAttr("mask", mask_op_info->GetAttr("mask")); + + auto* new_stmt = matched.at("search_group_padding")->stmt(); + auto* scope = new_stmt->op()->scope(); + auto w_name = matched.at("search_seq_fc_w")->arg()->name; + auto* w_t = scope->FindMutableTensor(w_name); + auto w_dims = w_t->dims(); + int w_len = w_t->numel(); + float* w_on_host = w_t->mutable_data(); + + float max_f = paddle::lite::xpu::math::FindMaxAbs(w_on_host, w_len); + std::unique_ptr w_int16(new int16_t[w_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + w_on_host, w_int16.get(), max_f, w_len); + memcpy(w_on_host, w_int16.get(), w_len * sizeof(int16_t)); + op_desc.SetAttr("W_max", max_f); + + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, scope); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + DirectedLink(matched.at("search_seq_fc_w"), + matched.at("search_group_padding")); + DirectedLink(matched.at("search_seq_fc_b"), + matched.at("search_group_padding")); + IR_OP_VAR_LINK(matched.at("search_group_padding"), + matched.at("search_seq_depadding_out")); + } +}; + +// 4 inputs +// ======== +// +// input_x +// input_y +// topk_row +// topk_col +// +// input_x ------- match_matrix_tensor ------- input_y +// | +// relu +// ________/ \________ +// | | +// var_conv_2d | +// | | +// relu | +// |_______ _______| +// \ / +// sequence_concat +// | +// topk_row ---- sequence_topk_avg_pooling ----- topk_col +// +class XPUMmdnnMatchConvTopkFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->AsInput(); + auto* input_y = VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* 
relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk_col = + VarNode("topk_col") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* topk_row = + VarNode("topk_row") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + ->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *topk_col >> *topk; + *topk_row >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is marked intermediate. 
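+    // Only links to nodes that still exist after fusion have to be rebuilt by
+    // hand, which is what the DirectedLink()/IR_OP_VAR_LINK() calls below do.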
+ // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +// 2 inputs +// ======== +// +// input_x +// input_y +// +// input_x ------- match_matrix_tensor ------- input_y +// | | | +// | relu | +// | ________/ \________ | +// | | | | +// | var_conv_2d | | +// | | | | +// | relu | | +// | |_______ _______| | +// | \ / | +// | sequence_concat | +// | | | +// |--------- sequence_topk_avg_pooling -------| +// +class XPUMmdnnMatchConvTopkFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* input_y = + VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + ->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *input_x >> *topk; + *input_y >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + 
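+    // This builds the same fused op as XPUMmdnnMatchConvTopkFuser; only the
+    // matched pattern differs, with ROW/COLUMN of sequence_topk_avg_pooling
+    // reusing the match_matrix_tensor inputs instead of two extra input vars.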
op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is marked intermediate. + // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidSeqRevEmbEltwiseFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + // fwd emb + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = + VarNode("emb0_out")->assert_is_op_output("lookup_table", "Out"); + auto* emb1 = OpNode("emb1", "lookup_table"); + auto* emb1_out = + VarNode("emb1_out")->assert_is_op_output("lookup_table", "Out"); + + auto* eltwise01 = OpNode("eltwise01", "search_seq_arithmetic"); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + // rev emb + auto* seq_rev2 = OpNode("seq_rev2", "sequence_reverse")->AsIntermediate(); + auto* seq_rev2_out = VarNode("seq_rev2_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_rev3 = OpNode("seq_rev3", "sequence_reverse")->AsIntermediate(); + auto* seq_rev3_out = VarNode("seq_rev3_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* emb2 = OpNode("emb2", "lookup_table")->AsIntermediate(); + auto* emb2_out = VarNode("emb2_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb3 = OpNode("emb3", "lookup_table")->AsIntermediate(); + auto* emb3_out = VarNode("emb3_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + + auto* eltwise23 = + OpNode("eltwise23", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise23_out = + 
VarNode("eltwise23_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *input0 >> *seq_rev2 >> *seq_rev2_out >> *emb2 >> *emb2_out >> *eltwise23 >> + *eltwise23_out; + *emb_tbl >> *emb2; + *input1 >> *seq_rev3 >> *seq_rev3_out >> *emb3 >> *emb3_out >> *eltwise23; + *emb_tbl >> *emb3; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("sequence_reverse"); + op_desc.SetInput("X", {matched.at("eltwise01_out")->arg()->name}); + op_desc.SetOutput("Y", {matched.at("eltwise23_out")->arg()->name}); + + auto emb0_op = matched.at("emb0")->stmt()->op(); + auto new_seq_rev_op = LiteOpRegistry::Global().Create("sequence_reverse"); + new_seq_rev_op->Attach(op_desc, emb0_op->scope()); + auto* new_seq_rev_node = + graph->GraphCreateInstructNode(new_seq_rev_op, emb0_op->valid_places()); + + DirectedLink(matched.at("eltwise01_out"), new_seq_rev_node); + DirectedLink(new_seq_rev_node, matched.at("eltwise23_out")); + } +}; + +class XPUMmdnnBidEmbAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *att_2in1 >> *att_2in1_out >> *seq_pool_2in1 >> + *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + 
op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", "att_2in1_w", "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_2in1_out", "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +// 5 outputs +// ========= +// +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// +class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + 
VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> 
*att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +// 6 outputs +// ========= +// +// emb0_out +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// +class XPUMmdnnBidEmbGrnnAttFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + 
->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "X") + ->AsOutput(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "Y") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = 
VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att2"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("emb0_out", {matched.at("emb0_out")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", 
+ {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnMergeAllFuser : public FuseBase { + public: + explicit XPUMmdnnMergeAllFuser(int n_concat_topk) + : n_concat_topk_(n_concat_topk) {} + + void BuildPattern() override { + auto* concat_7in1_input0 = VarNode("concat_7in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_7in1_input1 = VarNode("concat_7in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_7in1_input2 = VarNode("concat_7in1_input2") + ->assert_is_op_nth_input("concat", "X", 2) + ->AsInput(); + auto* concat_7in1_input3 = VarNode("concat_7in1_input3") + ->assert_is_op_nth_input("concat", "X", 3) + ->AsInput(); + auto* concat_7in1_input4 = VarNode("concat_7in1_input4") + ->assert_is_op_nth_input("concat", "X", 4) + ->AsInput(); + auto* concat_7in1_input5 = VarNode("concat_7in1_input5") + ->assert_is_op_nth_input("concat", "X", 5) + ->AsInput(); + auto* concat_7in1_input6 = VarNode("concat_7in1_input6") + ->assert_is_op_nth_input("concat", "X", 6) + ->AsInput(); + auto* concat_7in1 = OpNode("concat_7in1", "concat"); + auto* concat_7in1_out = VarNode("concat_7in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc0_w = VarNode("search_fc0_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc0_b = VarNode("search_fc0_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc0 = OpNode("search_fc0", "search_fc")->AsIntermediate(); + auto* search_fc0_out = VarNode("search_fc0_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + + auto* concat_topk_input0 = VarNode("concat_topk_input0") + ->assert_is_op_nth_input("concat", "X", 0) + 
->AsInput(); + auto* concat_topk_input1 = VarNode("concat_topk_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_topk = OpNode("concat_topk", "concat")->AsIntermediate(); + auto* concat_topk_out = VarNode("concat_topk_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + auto* concat_topk_inputx = VarNode(concat_topk_input_name) + ->assert_is_op_nth_input("concat", "X", i) + ->AsInput(); + *concat_topk_inputx >> *concat_topk; + } + + auto* seq_rev = OpNode("seq_rev", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_out = VarNode("seq_rev_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + + auto* grnn_rv_wh = VarNode("grnn_rv_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_rv_wi = VarNode("grnn_rv_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_rv = OpNode("grnn_rv", "search_grnn")->AsIntermediate(); + auto* grnn_rv_out = VarNode("grnn_rv_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_rv_idx_sorted_by_width = + VarNode("grnn_rv_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_rv_layout_input = + VarNode("grnn_rv_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_rv_tmp_buffer = + VarNode("grnn_rv_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_rv = + OpNode("seq_pool_rv", "sequence_pool")->AsIntermediate(); + auto* seq_pool_rv_out = VarNode("seq_pool_rv_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_rv_max_idx = + VarNode("seq_pool_rv_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_fw_wh = VarNode("grnn_fw_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_fw_wi = VarNode("grnn_fw_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_fw = OpNode("grnn_fw", "search_grnn")->AsIntermediate(); + auto* grnn_fw_out = VarNode("grnn_fw_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_fw_idx_sorted_by_width = + VarNode("grnn_fw_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_fw_layout_input = + VarNode("grnn_fw_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_fw_tmp_buffer = + VarNode("grnn_fw_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_fw = + OpNode("seq_pool_fw", "sequence_pool")->AsIntermediate(); + auto* seq_pool_fw_out = VarNode("seq_pool_fw_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_fw_max_idx = + VarNode("seq_pool_fw_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* rv_fw_concat = OpNode("rv_fw_concat", "concat")->AsIntermediate(); + auto* rv_fw_concat_out = VarNode("rv_fw_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + + auto* last_concat = OpNode("last_concat", "concat")->AsIntermediate(); + auto* last_concat_out = VarNode("last_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* 
search_fc1_w = VarNode("search_fc1_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc1_b = VarNode("search_fc1_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc1 = OpNode("search_fc1", "search_fc")->AsIntermediate(); + auto* search_fc1_out = VarNode("search_fc1_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* search_fc2_w = VarNode("search_fc2_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc2_b = VarNode("search_fc2_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc2 = OpNode("search_fc2", "search_fc")->AsIntermediate(); + auto* search_fc2_out = VarNode("search_fc2_out") + ->assert_is_op_output("search_fc", "Out") + ->AsOutput(); + + *concat_7in1_input0 >> *concat_7in1; + *concat_7in1_input1 >> *concat_7in1; + *concat_7in1_input2 >> *concat_7in1; + *concat_7in1_input3 >> *concat_7in1; + *concat_7in1_input4 >> *concat_7in1; + *concat_7in1_input5 >> *concat_7in1; + *concat_7in1_input6 >> *concat_7in1; + *concat_7in1 >> *concat_7in1_out >> *search_fc0 >> *search_fc0_out >> + *relu0 >> *relu0_out; + *search_fc0_w >> *search_fc0; + *search_fc0_b >> *search_fc0; + + *concat_topk_input0 >> *concat_topk; + *concat_topk_input1 >> *concat_topk; + *concat_topk >> *concat_topk_out >> *seq_rev >> *seq_rev_out; + + *seq_rev_out >> *grnn_rv >> *grnn_rv_out >> *seq_pool_rv >> + *seq_pool_rv_out; + *seq_pool_rv >> *seq_pool_rv_max_idx; + *grnn_rv_wh >> *grnn_rv; + *grnn_rv_wi >> *grnn_rv; + *grnn_rv >> *grnn_rv_idx_sorted_by_width; + *grnn_rv >> *grnn_rv_layout_input; + *grnn_rv >> *grnn_rv_tmp_buffer; + + *concat_topk_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> + *seq_pool_fw_out; + *seq_pool_fw >> *seq_pool_fw_max_idx; + *grnn_fw_wh >> *grnn_fw; + *grnn_fw_wi >> *grnn_fw; + *grnn_fw >> *grnn_fw_idx_sorted_by_width; + *grnn_fw >> *grnn_fw_layout_input; + *grnn_fw >> *grnn_fw_tmp_buffer; + + *seq_pool_rv_out >> *rv_fw_concat; + *seq_pool_fw_out >> *rv_fw_concat; + *rv_fw_concat >> *rv_fw_concat_out; + + *rv_fw_concat_out >> *last_concat; + *relu0_out >> *last_concat; + *last_concat >> *last_concat_out >> *search_fc1 >> *search_fc1_out >> + *relu1 >> *relu1_out >> *search_fc2 >> *search_fc2_out; + *search_fc1_w >> *search_fc1; + *search_fc1_b >> *search_fc1; + *search_fc2_w >> *search_fc2; + *search_fc2_b >> *search_fc2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_merge_all"); + auto* concat_7in1_op_info = matched.at("concat_7in1")->stmt()->op_info(); + op_desc.SetInput("concat_7in1_x", concat_7in1_op_info->Input("X")); + auto* concat_topk_op_info = matched.at("concat_topk")->stmt()->op_info(); + op_desc.SetInput("concat_topk_x", concat_topk_op_info->Input("X")); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_fw_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_fw_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_rv_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_rv_wi")->arg()->name}); + op_desc.SetInput("fc0_w", {matched.at("search_fc0_w")->arg()->name}); + op_desc.SetInput("fc0_b", {matched.at("search_fc0_b")->arg()->name}); + op_desc.SetInput("fc1_w", {matched.at("search_fc1_w")->arg()->name}); + op_desc.SetInput("fc1_b", 
{matched.at("search_fc1_b")->arg()->name}); + op_desc.SetInput("fc2_w", {matched.at("search_fc2_w")->arg()->name}); + op_desc.SetInput("fc2_b", {matched.at("search_fc2_b")->arg()->name}); + + op_desc.SetOutput("out", {matched.at("search_fc2_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_fw")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_rv")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* fc0_op_info = matched.at("search_fc0")->stmt()->op_info(); + op_desc.SetAttr("fc0_w_max", + fc0_op_info->GetAttr("__xpu__w_max")); + auto* fc1_op_info = matched.at("search_fc1")->stmt()->op_info(); + op_desc.SetAttr("fc1_w_max", + fc1_op_info->GetAttr("__xpu__w_max")); + auto* fc2_op_info = matched.at("search_fc2")->stmt()->op_info(); + op_desc.SetAttr("fc2_w_max", + fc2_op_info->GetAttr("__xpu__w_max")); + + auto* new_stmt = matched.at("concat_7in1")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "concat_topk_input0", + "concat_topk_input1", + "grnn_fw_wh", + "grnn_fw_wi", + "grnn_rv_wh", + "grnn_rv_wi", + "search_fc0_w", + "search_fc0_b", + "search_fc1_w", + "search_fc1_b", + "search_fc2_w", + "search_fc2_b", + }; + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + arg_names.push_back(concat_topk_input_name); + } + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("concat_7in1")); + } + std::vector out_names{ + "search_fc2_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("concat_7in1"), matched.at(name)); + } + } + + private: + int n_concat_topk_; +}; + +} // namespace fusion + +class XPUMmdnnFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUMmdnnFloat2Fix float_2_fix; + float_2_fix(graph.get()); + fusion::XPUMmdnnSearchAttentionFuser search_att_fuser; + search_att_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser match_conv_topk_fuser; + match_conv_topk_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser2 match_conv_topk_fuser2; + match_conv_topk_fuser2(graph.get()); + + fusion::XPUMmdnnBidSeqRevEmbEltwiseFuser bi_seq_rev_emb_eltwise_fuser; + bi_seq_rev_emb_eltwise_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser bid_emb_grnn_att_fuser; + bid_emb_grnn_att_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser2 bid_emb_grnn_att_fuser2; + bid_emb_grnn_att_fuser2(graph.get()); + fusion::XPUMmdnnBidEmbAttFuser bid_emb_att_fuser; + bid_emb_att_fuser(graph.get()); + for (int n_concat_topk : {3, 2}) { + fusion::XPUMmdnnMergeAllFuser merge_all_fuser(n_concat_topk); + merge_all_fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__mmdnn_fuse_pass, paddle::lite::mir::XPUMmdnnFusePass) + .BindTargets({TARGET(kXPU)}) + 
.BindKernel("__xpu__mmdnn_search_attention") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att2") + .BindKernel("__xpu__mmdnn_bid_emb_att") + .BindKernel("__xpu__mmdnn_match_conv_topk") + .BindKernel("__xpu__mmdnn_merge_all"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 525042e44b2997013943f392f592d812bd68fa0b..21bc266204d95c0f7faa8c3796e4b6255a3fe741 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase { op_desc.SetAttr("act_type", act_type_); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); auto* single_encoder_stmt = matched.at("q_mul")->stmt(); fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); @@ -639,20 +639,21 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::set fc_int31_ids; #ifdef LITE_WITH_XPU // TODO(miaotianxiang): core/mir/*_pass.cc are compiled anyway and need to - // access Context::_multi_encoder_precision, but this static member - // variable in class specialization defined in lite/core/context.cc - // is only compiled iff LITE_WITH_XPU==ON. To suppress linkage error, we use + // access TargetWrapperXPU::multi_encoder_precision, but this static member + // variable in class specialization defined in + // lite/backends/xpu/target_wrapper.cc is only compiled iff + // LITE_WITH_XPU==ON. To suppress linkage error, we use // #ifdef here. Any better idea? if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::Context::_multi_encoder_precision == "int31") { + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { fc_int31_ids = {0, 1, 2, 3, 4, 5}; VLOG(3) << "Use int31 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } else { VLOG(3) << "Use int16 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } #endif diff --git a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f017cc8c72f93a772f8bcbdc9aa96d5b0ad215d8 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc @@ -0,0 +1,1389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetCbamBlock0Fuser : public FuseBase { + public: + XPUResNetCbamBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* 
left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* left_conv4_weight = VarNode("left_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv4 = OpNode("left_conv4", "conv2d")->AsIntermediate(); + auto* left_conv4_out = VarNode("left_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", 
"Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> + *left_bn3_out /* >> *add*/; + + *left_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *left_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *left_conv4 >> *left_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *left_conv4_weight >> *left_conv4; + *left_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + 
*left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("left_conv4_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr<int32_t>("sub_block", 0); + op_desc.SetAttr<std::vector<std::string>>("input_data_names", {}); + op_desc.SetAttr<std::vector<std::string>>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared<cpp::ProgramDesc>(); + sub_program_desc->AddBlock(); + static_cast<operators::SubgraphOp*>(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector<std::string> froms = { + "left_conv2_weight", + "left_conv3_weight", + "left_conv4_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class 
XPUResNetCbamBlock1Fuser : public FuseBase { + public: + XPUResNetCbamBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + 
VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* right_conv4_weight = VarNode("right_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv4 = OpNode("right_conv4", "conv2d")->AsIntermediate(); + auto* right_conv4_out = VarNode("right_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", 
"elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out /* >> *add*/; + + *right_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *right_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *right_conv4 >> *right_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *right_conv4_weight >> *right_conv4; + *right_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + matched.at("right_conv4_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + "placeholder_sa_conv", + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + 
"placeholder_sa_conv", + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); + static_cast(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_conv4_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock2Fuser : public FuseBase { + public: + XPUResNetCbamBlock2Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input")->assert_is_op_input("clip", "X")->AsInput(); + + auto* clip = OpNode("clip", "clip"); + auto* clip_out = VarNode("clip_out") + ->assert_is_op_output("clip", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + auto* eltwise_y = VarNode("eltwise_y") + ->assert_is_op_input("elementwise_pow") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_pow = + OpNode("eltwise_pow", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow_out = VarNode("eltwise_pow_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->assert_is_op_input("pad2d", "X") + ->AsIntermediate(); + auto* pad2d = OpNode("pad2d", "pad2d")->AsIntermediate(); + auto* pad2d_out = VarNode("pad2d_out") + ->assert_is_op_output("pad2d", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* pool2d = OpNode("pool2d", "pool2d")->AsIntermediate(); + auto* pool2d_out = VarNode("pool2d_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* fill_const = OpNode("fill_const", "fill_constant")->AsIntermediate(); + auto* fill_const_out = VarNode("fill_const_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_div = + OpNode("eltwise_div", "elementwise_div")->AsIntermediate(); + auto* eltwise_div_out = VarNode("eltwise_div_out") + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* eltwise_pow2 = + OpNode("eltwise_pow2", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow2_out = VarNode("eltwise_pow2_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->AsIntermediate(); + + auto* shape = OpNode("shape", "shape")->AsIntermediate(); + auto* shape_out = VarNode("shape_out") + ->assert_is_op_output("shape", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* fill_const2 = + OpNode("fill_const2", "fill_constant")->AsIntermediate(); + auto* fill_const2_out = VarNode("fill_const2_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* gather = OpNode("gather", "gather")->AsIntermediate(); + auto* gather_out = VarNode("gather_out") + 
->assert_is_op_output("gather", "Out") + ->assert_is_op_input("assign", "X") + ->AsIntermediate(); + auto* assign = OpNode("assign", "assign")->AsIntermediate(); + auto* assign_out = VarNode("assign_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* fill_const3 = + OpNode("fill_const3", "fill_constant")->AsIntermediate(); + auto* fill_const3_out = VarNode("fill_const3_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("assign") + ->AsIntermediate(); + auto* assign2 = OpNode("assign2", "assign")->AsIntermediate(); + auto* assign2_out = VarNode("assign2_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("cast", "X") + ->AsIntermediate(); + auto* cast = OpNode("cast", "cast")->AsIntermediate(); + auto* cast_out = VarNode("cast_out") + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("reshape2", "Shape") + ->AsIntermediate(); + + auto* reshape2 = OpNode("reshape2", "reshape2")->AsIntermediate(); + auto* reshape2_out = VarNode("reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* reshape2_xshape = VarNode("reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* matmul_y = + VarNode("matmul_y")->assert_is_op_input("matmul", "Y")->AsInput(); + auto* matmul = OpNode("matmul", "matmul")->AsIntermediate(); + auto* matmul_out = VarNode("matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add_y = VarNode("eltwise_add_y") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto* eltwise_add = + OpNode("eltwise_add", "elementwise_add")->AsIntermediate(); + auto* eltwise_add_out = VarNode("eltwise_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* norm = OpNode("norm", "norm")->AsIntermediate(); + auto* norm_out = VarNode("norm_out") + ->assert_is_op_output("norm", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* norm_norm = VarNode("norm_norm") + ->assert_is_op_output("norm", "Norm") + ->AsIntermediate(); + auto* fill_const4 = + OpNode("fill_const4", "fill_constant")->AsIntermediate(); + auto* fill_const4_out = VarNode("fill_const4_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add2 = + OpNode("eltwise_add2", "elementwise_add")->AsIntermediate(); + auto* eltwise_add2_out = VarNode("eltwise_add2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* fill_const5 = + OpNode("fill_const5", "fill_constant")->AsIntermediate(); + auto* fill_const5_out = VarNode("fill_const5_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + + auto* eltwise_div2 = + OpNode("eltwise_div2", "elementwise_div")->AsIntermediate(); + auto* eltwise_div2_out = 
VarNode("eltwise_div2_out") + ->assert_is_op_output("elementwise_div", "Out") + ->AsOutput(); + + *input >> *clip >> *clip_out >> *eltwise_pow >> *eltwise_pow_out >> + *pad2d >> *pad2d_out >> *pool2d >> *pool2d_out >> *eltwise_pow2; + *eltwise_y >> *eltwise_pow; + + *fill_const >> *fill_const_out >> *eltwise_div >> *eltwise_div_out >> + *eltwise_pow2; + *eltwise_y >> *eltwise_div; + + *eltwise_pow2 >> *eltwise_pow2_out >> *shape >> *shape_out >> *gather >> + *gather_out >> *assign >> *assign_out >> *concat >> *concat_out >> + *cast >> *cast_out >> *reshape2; + *fill_const2 >> *fill_const2_out >> *gather; + *fill_const3 >> *fill_const3_out >> *assign2 >> *assign2_out >> *concat; + *eltwise_pow2_out >> *reshape2; + + *reshape2 >> *reshape2_out >> *matmul >> *matmul_out >> *eltwise_add >> + *eltwise_add_out; + *reshape2 >> *reshape2_xshape; + *matmul_y >> *matmul; + *eltwise_add_y >> *eltwise_add; + + *eltwise_add_out >> *norm >> *norm_out >> *eltwise_add2 >> + *eltwise_add2_out >> *eltwise_mul >> *eltwise_mul_out >> + *eltwise_div2 >> *eltwise_div2_out; + *norm >> *norm_norm; + *fill_const4 >> *fill_const4_out >> *eltwise_add2; + *fill_const5 >> *fill_const5_out >> *eltwise_mul; + *eltwise_add_out >> *eltwise_div2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block2"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", {matched.at("matmul_y")->arg()->name}); + op_desc.SetInput("Scale", {"placeholder_last_fc"}); + op_desc.SetInput("Bias", {matched.at("eltwise_add_y")->arg()->name}); + op_desc.SetInput("Mean", {"placeholder_last_fc"}); + op_desc.SetInput("Var", {"placeholder_last_fc"}); + op_desc.SetOutput("Outputs", {matched.at("eltwise_div2_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto block2_stmt = matched.at("clip")->stmt(); + auto* scope = block2_stmt->op()->scope(); + auto pow_tensor_name = matched.at("eltwise_y")->arg()->name; + auto* pow_tensor = scope->FindTensor(pow_tensor_name); + float pool_p = pow_tensor->data()[0]; + op_desc.SetAttr("pool_p", pool_p); + auto* matmul_op_info = matched.at("matmul")->stmt()->op_info(); + CHECK(matmul_op_info->GetAttr("transpose_Y") == true) + << "Y of last fc must have been transposed"; + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); + static_cast(fake_subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + fake_subgraph_op->Attach(op_desc, scope); + fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places()); + block2_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "matmul_y", "eltwise_add_y", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("clip")); + } + IR_OP_VAR_LINK(matched.at("clip"), matched.at("eltwise_div2_out")); + } +}; + +class XPUResNetCbamFuser : public xpu::XPUFuseBase { + public: + XPUResNetCbamFuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + 
->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = + VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_cbam_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_cbam_block0")->AsIntermediate(); 
+ auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block2 = + OpNode("resnet_block2", "resnet_cbam_block2")->AsIntermediate(); + auto* resnet_block2_out = + VarNode("resnet_block2_out") + ->assert_is_op_output("resnet_cbam_block2", "Outputs") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *resnet_block2 >> *resnet_block2_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void 
handle_placeholder_sa_conv(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void handle_placeholder_last_fc(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + auto filter_dims = filter_t->dims(); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + // XXX(miaotianxiang): Y has already been transposed in model... + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet_cbam"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + 
"resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + "resnet_block2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + + auto* resnet_cbam_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet_cbam_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + if (scale_name[i] == "placeholder_sa_conv") { + handle_placeholder_sa_conv( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } else if (scale_name[i] == "placeholder_last_fc") { + handle_placeholder_last_fc( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } + + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("Filter", filter_name); + 
op_desc.SetInput("Bias", bias_name); + op_desc.SetInput("MaxFilter", max_filter_name); + op_desc.SetOutput("Output", {matched.at("resnet_block2_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + auto* block2_op_info = matched.at("resnet_block2")->stmt()->op_info(); + op_desc.SetAttr("pool_p", block2_op_info->GetAttr("pool_p")); + + auto resnet_cbam_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet_cbam_op->Attach(op_desc, scope); + resnet_cbam_op->SetValidPlaces(resnet_cbam_stmt->op()->valid_places()); + auto kernels = + resnet_cbam_op->CreateKernels(resnet_cbam_op->valid_places()); + resnet_cbam_stmt->SetOp(resnet_cbam_op); + resnet_cbam_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("resnet_block2_out")); + } +}; + +} // namespace fusion + +class XPUResNetCbamFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetCbamBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetCbamBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNetCbamBlock2Fuser block2_fuser; + block2_fuser(graph.get()); + fusion::XPUResNetCbamFuser resnet_fuser; + resnet_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_cbam_fuse_pass, + paddle::lite::mir::XPUResNetCbamFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("__xpu__resnet_cbam"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc index de2210a76ea0647cb02131a088ceb754afd0ef9c..7024a872f30d3c78affe82648c902a6128de7070 100644 --- a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase { auto block0_stmt = matched.at("left_conv1")->stmt(); // block0_stmt->ResetOp(op_desc, graph->valid_places()); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); block0_stmt->SetOp(fake_subgraph_op); @@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase { auto block1_stmt = matched.at("right_conv1")->stmt(); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? 
- auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); block1_stmt->SetOp(fake_subgraph_op); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 68c07c0ffd0694aec0ff073082e1192213a0ef4a..20023830123939f1cf83706f69ca8a7a2703b646 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -25,21 +25,21 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; bool has_int8 = false; - bool has_arm_float = false; + bool has_arm = false; bool has_cuda = false; for (auto& place : graph->valid_places()) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } - if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - has_arm_float = true; + if (place.target == TARGET(kARM)) { + has_arm = true; } if (place.target == TARGET(kCUDA)) { has_cuda = true; } } - if (!has_int8 && has_arm_float) { + if (has_arm) { act_types.push_back("relu6"); act_types.push_back("leaky_relu"); } @@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 69be8dab0a06c26d5ca2bcdfe8327634edb9637d..a05f8fe8da5ee72581a9254b4d39354a0c5180e6 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -156,12 +156,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // little difference for int8 /////////////////////////////////////////////////////////////////////////////// if (enable_int8) { - PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"), - "INT8 mode: Conv should has weight_scale attr"); + std::string weight_name = conv_op_desc->Input("Filter").front(); + CHECK(conv_op_desc->HasInputScale(weight_name)) + << "INT8 mode: Conv should has weight_scale attr"; auto conv_weight_d = conv_weight_t->mutable_data(); // compute new conv_weight for int8 - auto weight_scale = - conv_op_desc->GetAttr>("weight_scale"); + auto weight_scale = conv_op_desc->GetInputScale(weight_name); if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -188,11 +188,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } } } - conv_op_desc->SetAttr("weight_scale", weight_scale); + conv_op_desc->SetInputScale(weight_name, weight_scale); } else if (is_weight_quantization) { std::string scale_name = conv_weight_name + "_quant_scale"; if (conv_op_desc->HasAttr(scale_name)) { - auto scale = conv_op_desc->GetAttr>(scale_name); + std::vector scale = + conv_op_desc->GetAttr>(scale_name); CHECK_EQ(scale.size(), alpha_tensor.numel()); for (size_t i = 0; i < scale.size(); i++) { scale[i] *= alpha_data[i]; diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h index 8bd8c0ce0600bb68667d96d07d43fa3028b5a856..841566067ba6675271227adfa82c74defac35f2a 100644 --- 
a/lite/core/mir/fusion/conv_bn_fuser.h +++ b/lite/core/mir/fusion/conv_bn_fuser.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/mir/pattern_matcher_high_api.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/mir/fusion/conv_conv_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9c4f0c02cd89e04d93af8e4dab71acc5d24e411 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/conv_conv_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ConvConvFusePass::Apply(const std::unique_ptr& graph) { + // initialze fuser params + std::vector conv_has_bias_cases{true, false}; + std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + bool has_arm = false; + for (auto& place : graph->valid_places()) { + if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { + has_arm = true; + break; + } + } + if (!has_arm) { + return; + } + // only support fp32 fusion + for (auto conv_has_bias0 : conv_has_bias_cases) { + for (auto conv_has_bias1 : conv_has_bias_cases) { + for (auto conv_type0 : conv_type_cases) { + for (auto conv_type1 : conv_type_cases) { + VLOG(4) << "conv_has_bias0:" << conv_has_bias0 + << " conv_type0:" << conv_type0; + VLOG(4) << "conv_has_bias1:" << conv_has_bias1 + << " conv_type1:" << conv_type1; + fusion::ConvConvFuser fuser( + conv_type0, conv_type1, conv_has_bias0, conv_has_bias1); + fuser(graph.get()); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass) + .BindTargets({TARGET(kARM)}); diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.h b/lite/core/mir/fusion/conv_conv_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..64e1b87ec9a8618572d6044f6dde2ab25c5a11c4 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ConvConvFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.cc b/lite/core/mir/fusion/conv_conv_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..737f96e69baa8953c0231fcc4c9e104907b17381 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ConvConvFuser::BuildPattern() { + auto* conv_input0 = VarNode("conv_input0") + ->assert_is_op_input(conv_type0_, "Input") + ->AsInput(); + auto* conv_weight0 = VarNode("conv_weight0") + ->assert_is_op_input(conv_type0_, "Filter") + ->AsInput(); + auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_); + auto* conv_out0 = VarNode("conv_out0") + ->assert_is_op_output(conv_type0_, "Output") + ->assert_is_op_input(conv_type1_, "Input") + ->AsIntermediate(); + + auto* conv_weight1 = VarNode("conv_weight1") + ->assert_is_op_input(conv_type1_, "Filter") + ->AsIntermediate(); + auto* conv1 = OpNode("conv2d1", conv_type1_) + ->assert_is_op(conv_type1_) + ->assert_op_attr("groups", 1) + ->AsIntermediate(); + + auto* conv_out1 = VarNode("conv_out1") + ->assert_is_op_output(conv_type1_, "Output") + ->AsOutput(); + + if (conv_has_bias0_) { + if (conv_has_bias1_) { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } else { + conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0}); + if (conv_has_bias1_) { + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } +} + +void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { + auto conv_instruct = matched.at("conv2d0")->stmt(); + auto conv_op_desc = conv_instruct->mutable_op_info(); + auto conv = conv_instruct->op(); + auto* scope 
= conv->scope(); + auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info(); + + // conv0 + auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name) + ->GetMutable<lite::Tensor>(); + + // conv1 + auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name) + ->GetMutable<lite::Tensor>(); + // auto groups0 = conv_op_desc->GetAttr<int>("groups"); + auto groups1 = conv_op_desc1->GetAttr<int>("groups"); + auto strides1 = conv_op_desc1->GetAttr<std::vector<int>>("strides"); + auto paddings1 = conv_op_desc1->GetAttr<std::vector<int>>("paddings"); + auto dilations1 = conv_op_desc1->GetAttr<std::vector<int>>("dilations"); + + bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; + bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? true : false; + int kw = weight1_t->dims()[2]; + int kh = weight1_t->dims()[3]; + if (!(kw == 1 && kh == 1)) { + return; + } + CHECK_EQ(enable0_int8, enable1_int8) << "The Conv compute type must be the same"; + CHECK_EQ(groups1, 1) << "The groups of weight1_dim must be 1"; + CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1]) + << "weight0_dims[0] == weight1_dim[1]"; + for (int i = 0; i < strides1.size(); i++) { + CHECK_EQ(strides1[i], 1) << "strides[" << i << "]: " << strides1[i] + << " must be 1"; + } + for (int i = 0; i < paddings1.size(); i++) { + CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i] + << " must be 0"; + } + for (int i = 0; i < dilations1.size(); i++) { + CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i] + << " must be 1"; + } + // compute new_weight and new_bias + /////////////////////////////////////////////////////////////////////////////// + // Compute ConvConvFuser + // Before fusion + // + // conv(x) = kx + z = y + // conv(y) = ay + b + // + // After fusion: + // + // conv(conv(x)) = a(kx + z) + b = akx + az + b + // + // new_weights = ak + // new_bias = az + b + /////////////////////////////////////////////////////////////////////////////// + if (enable0_int8) { + LOG(FATAL) << "int8 conv_conv fusion is not supported"; + return; + } else { + // compute new conv_weight + Tensor weight_tensor; + auto in_dims = weight0_t->dims(); + auto weight_dims = weight1_t->dims(); + const float* din = weight0_t->data<float>(); + const float* weights = weight1_t->data<float>(); + int oc0 = in_dims[0]; + int ic = in_dims[1]; + int ih = in_dims[2]; + int iw = in_dims[3]; + int oc = weight_dims[0]; + weight_tensor.Resize({oc, ic, ih, iw}); + float* dout = weight_tensor.mutable_data<float>(); + ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc); + weight0_t->CopyDataFrom(weight_tensor); + } + // compute new conv_bias + if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") && + conv_op_desc->Input("Bias").size() > 0) { + auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name) + ->GetMutable<lite::Tensor>(); + if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") && + conv_op_desc1->Input("Bias").size() > 0) { + auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name) + ->GetMutable<lite::Tensor>(); + Tensor bias; + bias.CopyDataFrom(*bias_t1); + auto bias_data = bias.mutable_data<float>(); + ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1); + bias_t1->CopyDataFrom(bias); + conv_op_desc->SetInput( + "Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias + IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0")); + } else { + Tensor bias; + auto weight_dims = weight1_t->dims(); + bias.Resize({weight_dims[0]}); + auto bias_d = bias.mutable_data<float>(); + ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr); + bias_t0->CopyDataFrom(bias); + conv_op_desc->SetInput( + 
"Bias", {matched.at("conv_bias0")->arg()->name}); // conv_bias + } + } else { + if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") && + conv_op_desc1->Input("Bias").size() > 0) { + conv_op_desc->SetInput( + "Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias + IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0")); + } + } + conv_op_desc->SetType(conv_type0_); + conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name}); + conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name}); + conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name}); + + auto update_conv_desc = *conv_instruct->mutable_op_info(); + conv_instruct->ResetOp(update_conv_desc, graph->valid_places()); + + IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1")); +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.h b/lite/core/mir/fusion/conv_conv_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1f58d1c8746a137e2078006016ec6007c2afbb --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.h @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ConvConvFuser : public FuseBase { + public: + explicit ConvConvFuser(const std::string& conv_type0, + const std::string& conv_type1, + const bool conv_has_bias0, + const bool conv_has_bias1) + : conv_type0_(conv_type0), + conv_type1_(conv_type1), + conv_has_bias0_(conv_has_bias0), + conv_has_bias1_(conv_has_bias1) {} + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + void ComputeNewWeight(float* dout, + const float* din, + const float* weights, + int oc0, + int ic, + int ih, + int iw, + int oc1) { + // input conv_weight0_t weights conv_weight1_t + // output weight_tensor + // ksize = 1 + int in_size = ih * iw; + int in_channel_size = ic * in_size; + // out = w1[j, i, ih, iw] * w2[k, j, kw, kh] + // out_dim = [oc1, ic, kh, kw], din_dim = [oc0, ic, kh, kw] + // weight_dim = [oc1, oc0, kh, kw] + for (int k = 0; k < oc1; k++) { + const float* weights_ptr = weights + k * oc0; + float* out_ptr = dout + k * in_channel_size; + for (int c = 0; c < ic; c++) { + float* out_ptr_channel = out_ptr + c * in_size; + const float* din_ptr = din + c * in_size; + for (int i = 0; i < in_size; i++) { + float sum = 0.f; + for (int j = 0; j < oc0; j++) { + sum += din_ptr[j * in_channel_size] * weights_ptr[j]; + } + *out_ptr_channel++ = sum; + } + } + } + } + + void ComputeNewBias(float* dout, + Tensor* bias0_tensor, + Tensor* weight_tensor, + Tensor* bias1_tensor) { + // input bias0_tensor weight_tensor bias1_tensor + // output bias_tensor + auto in_dims = bias0_tensor->dims(); + auto weight_dims = weight_tensor->dims(); + const float* din = bias0_tensor->data(); + const float* weights = weight_tensor->data(); + int ic = in_dims[0]; + int oc = weight_dims[0]; + // out_k = b0[num, j, 1, 1] * w2[k, j, 1, 1] + if (bias1_tensor) { + const float* din2 = bias1_tensor->data(); + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + sum += din[j] * weights_ptr[j]; + } + dout[k] = sum + din2[k]; + } + } else { + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + sum += din[j] * weights_ptr[j]; + } + dout[k] = sum; + } + } + } + + private: + std::string conv_type0_{"conv2d"}; + std::string conv_type1_{"conv2d"}; + bool conv_has_bias0_{false}; + bool conv_has_bias1_{false}; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index e2d8f96c53bd76d9495035c6ec56a5364b9bdcf5..d9bffffebfaabcca9c63700caf6e3ee91fa2eecb 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -24,8 +24,13 @@ namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { #ifdef LITE_WITH_X86 +#ifdef LITE_WITH_MLU + fusion::FcFuser fuser(false); + fuser(graph.get()); +#else fusion::FcFuser fuser(true); fuser(graph.get()); +#endif #endif fusion::FcFuser fuser2(false); @@ -38,7 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kX86)}) + .ExcludeTargets({TARGET(kXPU)}) +#ifndef 
LITE_WITH_MLU + .ExcludeTargets({TARGET(kX86)}) +#endif .ExcludeTargets({TARGET(kBM)}) - .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 3c99131083d37ea2c8511ed136bff17c891529af..8fdde50fc3015b411ee13fed15e92a93a1c722e5 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -71,7 +71,20 @@ void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + auto op_desc = *matched.at("mul")->stmt()->op_info(); + + // Get the input scale from mul + std::vector x_scale_vct; + std::vector y_scale_vct; + auto input_x_name = op_desc.Input("X").front(); + auto input_y_name = op_desc.Input("Y").front(); + bool is_quantized_op = op_desc.HasInputScale(input_x_name) && + op_desc.HasInputScale(input_y_name); + if (is_quantized_op) { + x_scale_vct = op_desc.GetInputScale(input_x_name); + y_scale_vct = op_desc.GetInputScale(op_desc.Input("Y").front()); + } + op_desc.mutable_inputs()->clear(); op_desc.mutable_outputs()->clear(); op_desc.SetType("fc"); @@ -85,6 +98,13 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { if (with_relu_) { op_desc.SetAttr("activation_type", std::string{"relu"}); } + + // Set the input scale into fc + if (is_quantized_op) { + op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale_vct); + op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct); + } + return op_desc; } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 80a033c75f2e23efa091375ee2a9f78e3ff40d71..da42d6d0c79a2a7975eacca7095fedababac6d89 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -34,19 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // fuse quantized node and dequant node - for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) { + std::vector quantized_op_types = { + "conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"}; + for (auto& op_type : quantized_op_types) { fusion::DequantOpFuser fuser(op_type); fuser(graph.get()); } - - for (auto& op_type : {"conv2d", "depthwise_conv2d"}) { + for (auto& op_type : quantized_op_types) { fusion::ChannelWiseDequantOpFuser fuser(op_type); fuser(graph.get()); } // process quant_dequant_node - fusion::DeleteQuantDequantOpFuser dqd_fuser; - dqd_fuser(graph.get()); + std::vector quant_dequant_op_types = { + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_moving_average_abs_max"}; + for (auto& op_type : quant_dequant_op_types) { + fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type); + dqd_fuser(graph.get()); + } } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f6d03cc23d56f8ae25f22b5b2667ed451ef8afaa..758a85c84064fa8d1953a6531300208d13525634 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -23,6 +23,20 @@ namespace lite { namespace mir { namespace fusion { +static std::string GetWeightArgname(const std::string& op_type) { + std::string weight_argname{}; + std::vector conv_ops = { + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; + std::vector mul_ops = {"mul", "matmul"}; + if (std::find(conv_ops.begin(), conv_ops.end(), op_type) != conv_ops.end()) { + 
weight_argname = "Filter"; + } else if (std::find(mul_ops.begin(), mul_ops.end(), op_type) != + mul_ops.end()) { + weight_argname = "Y"; + } + return weight_argname; +} + void DeleteQuantOpFuser::BuildPattern() { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_op_type_, "InScale"); @@ -64,13 +78,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, for (auto* quantized_node : outlinks) { // save input scale in quantized op by input argname + index auto op_desc = *quantized_node->stmt()->mutable_op_info(); - std::string argname; - int index; - op_desc.GetInputArgname(out_act_name, &argname); - op_desc.GetInputIndex(out_act_name, &index); - op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetInputScale(out_act_name, {scale_value}); op_desc.SetAttr("bit_length", bit_length); op_desc.UpdateAllInputs(out_act_name, in_act_name); quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); @@ -89,20 +97,13 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DequantOpFuser::BuildPattern() { - std::string weight_name = ""; - if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, weight_name) + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -135,6 +136,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* quantized_op = matched.at("quantized_op"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); @@ -150,14 +152,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // = max(abs(weight)) / range // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); auto quantized_weight_var_name = quantized_op_weight->arg()->name; auto quantized_weight_t = scope->FindVar(quantized_weight_var_name)->GetMutable(); std::vector weight_scale; - int weight_scale_size; + int weight_scale_size = 0; if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should @@ -173,7 +176,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
Tensor temp_tensor; @@ -204,12 +207,13 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void ChannelWiseDequantOpFuser::BuildPattern() { std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs"; + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, "Filter") + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -246,6 +250,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); @@ -265,17 +270,20 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, } // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); } - op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + if (quantized_op_type_ != "conv2d_transpose") { + op_desc.SetAttr("enable_int8", true); + } + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
auto quantized_weight_var_name = quantized_op_weight->arg()->name; @@ -307,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DeleteQuantDequantOpFuser::BuildPattern() { - std::string quant_dequant_op_type = - "fake_quantize_dequantize_moving_average_abs_max"; - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = - VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); + auto* input_act_node = VarNode("input_act_node") + ->assert_is_op_input(quant_dequant_op_type_, "X"); + auto* quant_dequant_node = + OpNode("quant_dequant_node", quant_dequant_op_type_) + ->assert_is_op(quant_dequant_op_type_); auto* output_scale_node = VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + ->assert_is_op_output(quant_dequant_op_type_, "OutScale"); auto* output_act_node = VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + ->assert_is_op_output(quant_dequant_op_type_, "Out"); + + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type_, "InScale"); + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + } else { + quant_dequant_node->LinksFrom({input_act_node}); + } output_scale_node->LinksFrom({quant_dequant_node}); output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto* input_scale_node = matched.at("input_scale_node"); auto* input_act_node = matched.at("input_act_node"); auto* quant_dequant_node = matched.at("quant_dequant_node"); auto* output_scale_node = matched.at("output_scale_node"); @@ -352,22 +363,7 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, // Save quantization info in op_info attr auto op_info = *quantized_node->stmt()->op_info(); op_info.SetAttr("bit_length", bit_length); - - std::string argname; - int index; - op_info.GetInputArgname(output_act_name, &argname); - op_info.GetInputIndex(output_act_name, &index); - op_info.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - std::string op_type = op_info.Type(); - // Analyse the weight scale or input scale. 
- if (((op_type == "conv2d" || op_type == "depthwise_conv2d") && - argname == "Input") || - ((op_type == "mul" || op_type == "matmul") && argname == "Y")) { - op_info.SetAttr("weight_scale", scale_value); - } else { - op_info.SetAttr("input_scale", scale_value); - } + op_info.SetInputScale(output_act_name, {scale_value}); op_info.UpdateAllInputs(output_act_name, input_act_name); quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); @@ -375,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, } // delete nodes and edges std::set nodes2rm = { - input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + quant_dequant_node, output_scale_node, output_act_node}; + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = matched.at("input_scale_node"); + nodes2rm.insert(input_scale_node); + } GraphSafeRemoveNodes(graph, nodes2rm); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index ac3ac112b3aa504bc075125f2f13292073ca9444..c2dd1e5191cf0ad9b242dfa230abe3d38bad0cf7 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -86,17 +86,22 @@ class ChannelWiseDequantOpFuser : public FuseBase { std::string quantized_op_type_{}; }; -/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * quantized_op" can be deteted by this fuser. The fuser modifies the input - * scale for the quantized_op and deletes the fake_quant_dequant_op. +/* The pattern like "fake_quantize_dequantize_op + quantized_op" can be + * deteted by this fuser. The fuser modifies the input scale for the + * quantized_op and deletes the fake_quant_dequant_op. 
*/ class DeleteQuantDequantOpFuser : public FuseBase { public: + explicit DeleteQuantDequantOpFuser(const std::string& quant_dequant_op_type) + : quant_dequant_op_type_(quant_dequant_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + + private: + std::string quant_dequant_op_type_{}; }; } // namespace fusion diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc index d578b725ec42c926e5f0581fd8eeef855e586bdc..68417783e932f3c882eaae38e620b8b651b937dd 100644 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc +++ b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc @@ -84,11 +84,12 @@ cpp::OpDesc TransposeSoftmaxTransposeFuser::GenOpDesc( op_desc.SetInput("X", {matched.at("x1")->arg()->name}); op_desc.SetOutput("Out", {matched.at("out")->arg()->name}); op_desc.SetAttr("axis", - matched.at("transpose1") - ->stmt() - ->op_info() - ->GetAttr>("axis") - .back()); + *(matched.at("transpose1") + ->stmt() + ->op_info() + ->GetAttr>("axis") + .end() - + 1)); return op_desc; } diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index d7486c0933dbbe74115bd6358962817b2b946c12..3c9bac1c5b9fbf6d48683f6423a4c670b17cb127 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { nodes_in_order = graph->StmtTopologicalOrder(); } + insts_.emplace_back(); for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); @@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { .SetSyncStreams(stmt.sync_streams_); } #endif - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); + insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } } diff --git a/lite/core/mir/generate_program_pass.h b/lite/core/mir/generate_program_pass.h index b126b4aba4d09a95a0033b04ed241812c88a3287..2ef4d035710d9542b365789aeabe8a08537ff225 100644 --- a/lite/core/mir/generate_program_pass.h +++ b/lite/core/mir/generate_program_pass.h @@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass { } private: - std::vector insts_; + std::vector> insts_; }; } // namespace mir diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 55b7a004567ec5a5298e084839d6dcf5a8591882..98b1597b49b9a7e151c86d11843e45163890191a 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -62,15 +62,17 @@ std::string Visualize(mir::SSAGraph* graph) { << string_trunc(op_info->GetAttr(attr_name)) << "\""; break; case AttrType::FLOATS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = + op_info->GetAttr>(attr_name); os << ":floats: {" + Join(vals, ",") << "}"; } break; case AttrType::INTS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = op_info->GetAttr>(attr_name); os << ":ints: {" + Join(vals, ",") + "}"; } break; case AttrType::STRINGS: { - auto vals = op_info->GetAttr>(attr_name); + std::vector vals = + op_info->GetAttr>(attr_name); os << ":strings: {" + string_trunc(Join(vals, ",")) << "}"; } break; default: diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 
5ad094fd4219bcbb3c59ec1c71f42af6cac5a11a..eddbebb545351fa6b1820682af487bb7b04e8bb3 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -314,4 +314,6 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU), - TARGET(kAPU)}); + TARGET(kAPU), + TARGET(kMLU), + TARGET(kHuaweiAscendNPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index ba48d5d4ead5ea922ded0bff3a87c2c127595790..e09220d083ee8241001b6d9d55fb48eb1ba74f2e 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -14,18 +14,22 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include +#include #include #include #include #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +static thread_local int g_stream_id = 0; + Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, SSAGraph* graph, @@ -37,6 +41,10 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, cast_arg->AsArg().type = cast_type; inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + VLOG(4) << "insert cast before subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); + // create the stmt node auto* cast_inst = graph->NewInstructNode(); // create op @@ -60,14 +68,17 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, CHECK(0) << "Unsupport cast type"; } cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { if (op_type == "cast") { const Type* in_arg_ty = kernel->GetInputDeclType("X"); - if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } } else if (op_type == "layout") { @@ -83,24 +94,22 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && - TargetCompatibleTo(*out_arg_ty, *cast_type)) { + TargetCompatibleTo(*out_arg_ty, *cast_type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { CHECK(0) << "Unsupport cast type"; } if (is_found) { + VLOG(4) << "insert kernel: " << kernel->name(); selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), 
g_stream_id)); break; } } @@ -124,6 +133,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type var->GetMutable(); + VLOG(4) << "insert cast after subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -133,8 +145,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cpp::OpDesc op_desc; op_desc.SetType(op_type); if (op_type == "cast") { - op_desc.SetAttr("in_dtype", 4); // FP32 - op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetAttr("in_dtype", 4); // FP16 + op_desc.SetAttr("out_dtype", 5); // FP32 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "layout") { @@ -150,8 +162,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { @@ -164,14 +177,17 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (DataLayoutCompatible(*in_arg_ty, *cast_type) && - DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cast_type) && - TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { @@ -182,13 +198,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -203,7 +214,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, void MLUPostprocessPass::InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* head_type = head_node->AsArg().type; // break original link @@ -218,39 +230,52 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name) != first_conv_nodes_.end(); // precision cast node - if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { + if (!use_mlu_cast) { + if (head_type->precision() != inst_type->precision() && + !is_first_conv_head) { + cur_node = 
InsertCastBefore("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + head_type->layout())); + } + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), head_type->layout())); - } - - // layout cast node - if (head_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { + // io copy cur_node = InsertCastBefore( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), head_type->precision(), head_type->layout())); } - // io copy - cur_node = InsertCastBefore( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(cur_node, inst_node); @@ -259,13 +284,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); - UpdateInputTo( - sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -311,10 +342,9 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, CHECK(subgraph_precision == PRECISION(kFloat) || subgraph_precision == PRECISION(kFP16)) << "Mlu node has unsupport precision"; - VLOG(4) << "picked kernel precision: " - << PrecisionToStr(subgraph_precision); *arg_type = LiteType::GetTensorTy( subgraph_target, subgraph_precision, subgraph_layout); + VLOG(4) << "picked subgraph kernel type: " << (*arg_type)->name(); break; } } @@ -356,7 +386,8 @@ bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { void MLUPostprocessPass::InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* tail_type = tail_node->AsArg().type; // break original link @@ -367,39 +398,50 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; // precision cast node - if (tail_type->precision() != inst_type->precision()) { + if (!use_mlu_cast) { + if (tail_type->precision() != inst_type->precision()) { + 
cur_node = InsertCastAfter("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + tail_type->layout())); + } + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), tail_type->layout())); - } - - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { cur_node = InsertCastAfter( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), tail_type->precision(), tail_type->layout())); } - // io copy - cur_node = InsertCastAfter( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(inst_node, cur_node); @@ -408,21 +450,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); UpdateOutputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); /* graph like this * subgraph_op_0 * / \ * / \ * subgraph_op_1 host_op */ - UpdateInputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -446,15 +494,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } -bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { - auto* block_desc = - static_cast(inst->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { - auto op_desc = block_desc->GetOp(op_idx); - CHECK(op_desc); - if (op_desc->Type() == "conv2d") { - for (auto& names : op_desc->inputs()) { +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, + Node* inst_node) { + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < 
sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + CHECK(sub_op_desc); + if (sub_op_desc->Type() == "conv2d") { + for (auto& names : sub_op_desc->inputs()) { if (std::find(names.second.begin(), names.second.end(), arg_node->AsArg().name) != names.second.end()) { @@ -496,6 +551,74 @@ void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { } } +void MLUPostprocessPass::ModifyInputOutputDataType(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph); + for (auto& in_node : node.inlinks) { + const auto* in_node_type = in_node->AsArg().type; + VLOG(4) << "MLU subgraph input type: " << in_node->AsArg().name + << *in_node_type; + if (in_node->AsArg().is_weight || in_node->AsArg().is_persist) { + CHECK(in_node_type->target() == TARGET(kHost) && + in_node_type->precision() == PRECISION(kAny) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + in_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK((in_node_type->target() == TARGET(kHost) || + in_node_type->target() == TARGET(kX86)) && + in_node_type->precision() == PRECISION(kFloat) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected common input type!"; + } + } + for (auto& out_node : node.outlinks) { + const auto* out_node_type = out_node->AsArg().type; + auto& out_arg = out_node->AsArg(); + VLOG(4) << "MLU subgraph output type: " << out_node->AsArg().name + << *out_node_type; + if (out_node->AsArg().is_weight || out_node->AsArg().is_persist) { + CHECK(out_node_type->target() == TARGET(kHost) && + out_node_type->precision() == PRECISION(kAny) && + out_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + out_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else if (out_node_type->precision() == PRECISION(kAny) && + out_node->outlinks.empty()) { + out_arg.is_persist = true; + out_arg.type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK(out_node_type->precision() == PRECISION(kFloat)) + << "MLU subgraph unexpected common output type!"; + if (out_node->outlinks.empty()) { + out_arg.type = LiteType::GetTensorTy(TARGET(kHost), + subgraph_arg_type->precision(), + DATALAYOUT(kNHWC)); + VLOG(4) << "unused output node type: " << out_arg.name + << out_node_type->name(); + } else { + out_arg.type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + VLOG(4) << "output node type: " << out_arg.name + << out_node_type->name(); + } + } + const auto target = out_node->AsArg().type->target(); + const auto precision = out_node->AsArg().type->precision(); + const auto layout = out_node->AsArg().type->layout(); + VLOG(4) << "arg name: " << out_node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -515,6 +638,16 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { old_type->precision(), paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); + // modify inst feed to NHWC, 
while set_mlu_input_layout(kNHWC) + // invoked, to keep consistent with actual data layout + auto place = node.AsStmt().place(); + place.layout = DATALAYOUT(kNHWC); + std::vector valid_places = {place}; + auto updated_op_info = *node.AsStmt().op_info(); + node.AsStmt().ResetOp(updated_op_info, valid_places, nullptr); + auto kernel = &(node.AsStmt().picked_kernel()); + VLOG(4) << "kernel info: " << kernel->name(); + node.AsStmt().op()->AttachKernel(kernel); } } } @@ -540,6 +673,219 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } } +std::pair CheckInputAndInsert(Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& input_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = input_name; + bool do_insert = false; + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_op = block_desc->AddOp(); + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph input, arg tensor name: " + << layout_arg_name; + layout_op->SetType("layout"); + layout_op->SetInput("Input", {cur_node}); + layout_op->SetOutput("Out", {layout_arg_name}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (!PrecisionCompatible(*tensor_type, *subgraph_type) && + tensor_type->precision() != PRECISION(kInt8) && + tensor_type->precision() != PRECISION(kInt32)) { + auto cast_op = block_desc->AddOp(); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph input, arg tensor name: " + << cast_arg_name; + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 5); // FP32 + cast_op->SetAttr("out_dtype", 4); // FP16 + cast_op->SetInput("X", {cur_node}); + cast_op->SetOutput("Out", {cast_arg_name}); + cur_node = cast_arg_name; + do_insert = true; + } + + return std::make_pair(do_insert, cur_node); +} + +std::pair CheckOutputAndInsert( + Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& output_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = output_name; + bool do_insert = false; + cpp::OpDesc *layout_op = nullptr, *cast_op = nullptr; + size_t cast_idx = 0; + + // subgraph -> cast -> layout -> output + if (!PrecisionCompatible(*tensor_type, *subgraph_type)) { + cast_op = block_desc->AddOp(); + cast_idx = block_desc->OpsSize() - 1; + CHECK_EQ(cast_op, block_desc->GetOp(cast_idx)); + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 4); // FP16 + cast_op->SetAttr("out_dtype", 5); // FP32 + do_insert = true; + } + + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph output, arg tensor name: " + << layout_arg_name; + layout_op = block_desc->AddOp(); + layout_op->SetType("layout"); + layout_op->SetInput("Input", {layout_arg_name}); + layout_op->SetOutput("Out", {cur_node}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (cast_op) { + cast_op = block_desc->GetOp(cast_idx); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph output, arg tensor name: " + << cast_arg_name; + cast_op->SetInput("X", {cast_arg_name}); + cast_op->SetOutput("Out", {cur_node}); + cur_node = cast_arg_name; + } + + return std::make_pair(do_insert, cur_node); +} + +// insert cast op on mlu, to avoid cast on cpu +void 
MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, + const Type* subgraph_type) { + CHECK_EQ(subgraph_node->AsStmt().op()->Type(), "subgraph"); + auto subgraph_op = + dynamic_cast(subgraph_node->AsStmt().op().get()); + CHECK(subgraph_op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = const_cast( + sub_program_desc->GetBlock(sub_block_idx)); + + // create a new block desc to keep op sequence correct + cpp::BlockDesc new_block_desc; + new_block_desc.ClearOps(); + new_block_desc.ClearVars(); + new_block_desc.SetIdx(sub_block_desc->Idx()); + new_block_desc.SetParentIdx(sub_block_desc->ParentIdx()); + new_block_desc.SetForwardBlockIdx(sub_block_desc->ForwardBlockIdx()); + + // find all IO that is not weight or persist + std::list i_names, o_names; + std::map node_replace; + + // Insert cast op for iotensor which is not weight or persist + for (auto& input : subgraph_node->inlinks) { + auto input_name = input->AsArg().name; + if (!(input->AsArg().is_weight || input->AsArg().is_persist)) { + i_names.emplace_back(input_name); + auto ret = CheckInputAndInsert(subgraph_op->scope(), + &new_block_desc, + input_name, + input->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[input_name] = ret.second; + } + } + } + for (auto& output : subgraph_node->outlinks) { + auto output_name = output->AsArg().name; + if (!(output->AsArg().is_weight || output->AsArg().is_persist)) { + o_names.emplace_back(output_name); + auto ret = CheckOutputAndInsert(subgraph_op->scope(), + sub_block_desc, + output_name, + output->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[output_name] = ret.second; + } + } + } + + // update input and output + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + auto new_op_desc = new_block_desc.AddOp(); + *new_op_desc = *sub_op_desc; + + if (sub_op_desc->Type() != "layout" && sub_op_desc->Type() != "cast") { + auto op_input_args = new_op_desc->InputArgumentNames(); + for (auto& input_arg : op_input_args) { + auto op_input = new_op_desc->Input(input_arg); + for (auto& it : i_names) { + auto index = std::find(op_input.begin(), op_input.end(), it); + if (index != op_input.end() && + node_replace.find(it) != node_replace.end()) { + index = op_input.erase(index); + op_input.emplace(index, node_replace.at(it)); + VLOG(4) << new_op_desc->Type() << "] change input from " << it + << " to " << node_replace.at(it); + } + } + new_op_desc->SetInput(input_arg, op_input); + } + + auto op_output_args = new_op_desc->OutputArgumentNames(); + for (auto& output_arg : op_output_args) { + auto op_output = new_op_desc->Output(output_arg); + for (auto& it : o_names) { + auto index = std::find(op_output.begin(), op_output.end(), it); + if (index != op_output.end() && + node_replace.find(it) != node_replace.end()) { + index = op_output.erase(index); + op_output.emplace(index, node_replace.at(it)); + VLOG(4) << new_op_desc->Type() << "] change output from " << it + << " to " << node_replace.at(it); + } + } + new_op_desc->SetOutput(output_arg, op_output); + } + } + } + + *sub_block_desc = new_block_desc; +} + +void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) { + // remove invalid places, since only support X86, host, MLU + auto v_places = graph->valid_places(); + for (auto it = v_places.begin(); it != v_places.end();) { + if (it->target != TARGET(kMLU) 
&& it->target != TARGET(kHost) && + it->target != TARGET(kX86)) { + it = v_places.erase(it); + } else { + ++it; + } + } + + if (use_mlu_cast) { + // insert mlu float place for float io copy, no effect to subgraph type + v_places.emplace_back(TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)); + } + + graph->SetValidPlaces(v_places); + VLOG(4) << "valid places after modified:"; + for (auto& p : v_places) { + VLOG(4) << p.DebugString(); + } +} + void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // currently for non-persistent input and output args, mlu subgraph op // only support float16/float32 data type @@ -549,35 +895,47 @@ void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // arg_in and arg_out are assumed to be NHWC which user should be aware of. // Thus here we change these args' layout to NHWC #ifdef LITE_WITH_MLU - if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyInputOutputDataType(graph.get()); + + if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::DeviceInfo::Global().UseFirstConv()) { + if (lite::TargetWrapperMlu::UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } #endif + g_stream_id = static_cast(reinterpret_cast(graph.get())); + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + ModifyValidPlaces(graph.get(), !disable_mlu_cast); // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { const Type* subgraph_arg_type = nullptr; GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + if (!disable_mlu_cast) { + AdjustSubgraph(&node, subgraph_arg_type); + } auto links_tmp = node.inlinks; for (auto p_in : links_tmp) { if (NeedInsert(p_in, subgraph_arg_type)) { - InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + InsertBefore( + graph.get(), p_in, &node, subgraph_arg_type, !disable_mlu_cast); } } links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); for (auto p_out : links_tmp) { if (NeedInsert(p_out, subgraph_arg_type)) { - InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + InsertAfter( + graph.get(), p_out, &node, subgraph_arg_type, !disable_mlu_cast); } } } } + // std::vector> subgraphs({graph->NodeTopologicalOrder()}); + // SubgraphVisualizer(graph.get(), subgraphs)(); } } // namespace mir diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 688dd06fb5fbec0c8e1c53acfe4215456ddb4192..5a31c1d8322db7bbc57de8dd18fdaf8ff4b0c885 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass { const Type** arg_type, SSAGraph* graph); + void ModifyInputOutputDataType(SSAGraph* graph); + void ModifyLayout(SSAGraph* graph); bool NeedInsert(Node* node, const Type* inst_type); @@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass { void InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); void InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); Node* InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, @@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass { bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + void AdjustSubgraph(Node* subgraph_node, const Type* 
op_type); + private: std::set first_conv_nodes_; }; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 66b37446a4cc6a33c09757266c9dd2cbc818325e..259447aa21b76261a266a243dcc9c2a7530c9dc5 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -37,34 +37,53 @@ void QuantizedOpAttributesInferencePass::Apply( auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); - if (!op_info->HasAttr("input_scale")) continue; - bool found = false; - float output_scale; + + // Check if any of the inputs of the op have scale value + bool has_input_scale = false; + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + auto in_var_node_name = in_var_node->arg()->name; + has_input_scale |= op_info->HasInputScale(in_var_node_name); + } + if (!has_input_scale) continue; + + // Infer the output scale according to its out_threshold or the input scale + // of its adjacent ops + bool is_quantized = true; for (auto out_var_node : op_node->outlinks) { CHECK(out_var_node->IsArg()); + std::vector output_scale; + bool has_output_scale = false; + auto out_var_node_name = out_var_node->arg()->name; for (auto out_op_node : out_var_node->outlinks) { CHECK(out_op_node->IsStmt()); auto& out_inst = out_op_node->AsStmt(); auto out_op_info = out_inst.op_info(); - if (!out_op_info->HasAttr("input_scale")) continue; - auto input_scale = out_op_info->GetAttr("input_scale"); - if (!found) { - found = true; + if (!out_op_info->HasInputScale(out_var_node_name)) continue; + auto input_scale = out_op_info->GetInputScale(out_var_node_name); + if (!has_output_scale) { output_scale = input_scale; + has_output_scale = true; } else { - CHECK_EQ(output_scale, input_scale); + CHECK_EQ(output_scale.size(), input_scale.size()); } } + if (has_output_scale) { + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else if (op_info->HasAttr("out_threshold")) { + // Only consider one output, there are only one out_threshold + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = std::vector{ + op_info->GetAttr("out_threshold") / range}; + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else { + is_quantized = false; + } } - if (found) { - inst.mutable_op_info()->SetAttr("output_scale", output_scale); - } else if (op_info->HasAttr("output_scale")) { - int bit_length = op_info->GetAttr("bit_length"); - int range = (1 << (bit_length - 1)) - 1; - output_scale = op_info->GetAttr("output_scale"); - inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); - } - if (op_info->HasAttr("output_scale")) { + + // Fix the missing of the attribute 'enable_int8'. 
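As a concrete instance of the out_threshold conversion above: with the usual bit_length of 8 the quantization range is (1 << 7) - 1 = 127, so an op carrying out_threshold = 6.35 (an illustrative value) gets output_scale = 6.35 / 127 = 0.05, stored per output tensor name via SetOutputScale.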
+ if (is_quantized) { inst.mutable_op_info()->SetAttr("enable_int8", true); } } diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 5b6f968484b7b49838a004c3edfd00ff9b7e5e5e..7ad833b22885204130b50a931dc2da7d040c654c 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( inst.picked_kernel().target())); } +#elif LITE_WITH_MLU + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), + static_cast(reinterpret_cast(graph.get())))); #else int stream_id = inst.stream_id_; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index f8991a359b177799cc5f59651c5d305fe64231ef..9cf7bc8995766e47895ce3dd2ef6bf7bcb614e5c 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -153,60 +153,61 @@ Node *SSAGraph::GraphCreateInstructNode( } void SSAGraph::Build(const Program &program, - const std::vector &valid_places) { + const std::vector &valid_places, + int block_idx) { CHECK(node_storage_.empty()); - auto weights_name = program.weights(); - auto is_weights = [&](const std::string &name) -> bool { - auto it = std::find(weights_name.begin(), weights_name.end(), name); - if (it == weights_name.end()) return false; + auto weights = program.weights(); + auto is_weight = [&](const std::string &name) -> bool { + auto it = std::find(weights.begin(), weights.end(), name); + if (it == weights.end()) return false; return true; }; - std::map var_types = program.var_data_type(); - - std::map arg_update_node_map_; - for (auto &op : program.ops()) { + auto var_type_map = program.var_type_map(); + std::map arg_update_node_map; + for (auto &op : program.ops(block_idx)) { VLOG(3) << op->op_info()->Type(); auto *op_node = GraphCreateInstructNode(op, valid_places); - for (const std::string &name : op->op_info()->input_names()) { + auto *op_info = op->op_info(); + const auto &op_type = op_info->Type(); + for (const auto &var_name : op_info->input_names()) { mir::Node *arg_node = nullptr; - if (arg_update_node_map_.count(name)) { - arg_node = arg_update_node_map_.at(name); + if (arg_update_node_map.count(var_name)) { + arg_node = arg_update_node_map.at(var_name); } else { node_storage_.emplace_back(); arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; } - if (var_types.count(name)) { + if (var_type_map.count(var_name)) { if (!arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->arg()->type = var_type_map[var_name]; } // Store the original data type of the output tensors for // type_precision_cast_pass, to keep the consistency between the // output types of original graph and optimized graph's - if (op->op_info()->Type() == "fetch") { + if (op_type == "fetch") { op->mutable_op_info()->SetAttr( - "data_type", static_cast(var_types[name])); + "data_type", + static_cast(var_type_map[var_name]->precision())); } } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); } - for (const std::string &name : 
op->op_info()->output_names()) { + for (const auto &var_name : op->op_info()->output_names()) { node_storage_.emplace_back(); auto *arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; + if (var_type_map.count(var_name) && !arg_node->arg()->type) { + arg_node->arg()->type = var_type_map[var_name]; } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(op_node, arg_node); } diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index e2967cf96a6b00ccc225ce05b043cb94f161b1d6..819b0a71ea1be04c85316e90001aef311b7d7238 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -35,9 +35,13 @@ class GraphBase {}; class SSAGraph : GraphBase { public: - // @param program: the op program + // @param program: the target program with vars and ops // @param valid_places: the valid places user set for the system. - void Build(const Program &program, const std::vector &valid_places); + // @param block_idx: the block index in the target program, default is 0(main + // block) + void Build(const Program &program, + const std::vector &valid_places, + int block_idx = kRootBlockIdx); void RemoveNode(const mir::Node *node); std::vector StmtTopologicalOrder(); diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 1de0d1a26577b31e1dfc5187562cc80bce6fe4d1..b5dd1f8b9c119f4647b72a35eb71df37f31fc6f8 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -110,15 +110,16 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (out_type_int8) { auto out_node = node.outlinks.front(); CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; auto one_adj_op_node = out_node->outlinks.front(); CHECK(one_adj_op_node->IsStmt()); auto& one_adj_instruct = one_adj_op_node->AsStmt(); CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); auto update_desc = *instruct.mutable_op_info(); instruct.ResetOp(update_desc, graph->valid_places()); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 31a38280ff537d486f5fb3ba46dee5b025d3f1f1..13805b2b18634551d4b74ac436954fa8f6b9ed05 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -411,34 +411,60 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops and Vars of the target - // subgraph and sub_block_idx is set as a attribute of subgraph op, - // sub_block_idx < 0 means it's a new subgraph op - int sub_block_idx = -(subgraph_idx + 1); - auto sub_block_desc = new cpp::BlockDesc(); + // Create a program desc 
and a block desc for storing all of Ops and Vars of + // the target subgraph and sub_block_idx is set as a attribute of subgraph op, + // sub_block_idx = 0 means it's a new subgraph op + auto sub_program_desc = std::make_shared(); + int sub_block_idx = 0; + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); for (auto &op_node : subgraph_nodes) { - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_node->AsStmt().op_info(); + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_node->AsStmt().op_info(); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); // Extract input and output nodes from the target subgraph - std::set input_var_nodes; + std::set idata_var_nodes; std::set weight_var_nodes; - std::set output_var_nodes; + std::set odata_var_nodes; std::set local_var_nodes; std::set unused_var_nodes; ExtractInputsOutputs(subgraph_nodes, - &input_var_nodes, + &idata_var_nodes, &weight_var_nodes, - &output_var_nodes, + &odata_var_nodes, &local_var_nodes, &unused_var_nodes); - + // A simplified model without the original weight/local/unused nodes on the + // subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to + // true(default) and Predictor->Run(...), Predictor->Save(...) is called. + std::set input_var_nodes(idata_var_nodes.begin(), + idata_var_nodes.end()); + std::set output_var_nodes(odata_var_nodes.begin(), + odata_var_nodes.end()); + if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) { + input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end()); + output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end()); + output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end()); + } // Set input and output name mapping which stores the real inputs and // outputs + std::vector idata_var_names; + std::vector odata_var_names; + for (auto &var_node : idata_var_nodes) { + idata_var_names.push_back(var_node->AsArg().name); + } + for (auto &var_node : odata_var_nodes) { + odata_var_names.push_back(var_node->AsArg().name); + } + subgraph_op_desc.SetAttr>("input_data_names", + idata_var_names); + subgraph_op_desc.SetAttr>("output_data_names", + odata_var_names); + // Set all of the inputs and outputs to the target subgraph op + // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() std::vector input_var_names; std::vector output_var_names; for (auto &var_node : input_var_nodes) { @@ -447,60 +473,36 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : output_var_nodes) { output_var_names.push_back(var_node->AsArg().name); } - subgraph_op_desc.SetAttr>("input_data_names", - input_var_names); - subgraph_op_desc.SetAttr>("output_data_names", - output_var_names); + subgraph_op_desc.SetInput("Inputs", input_var_names); + subgraph_op_desc.SetOutput("Outputs", output_var_names); + auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + static_cast(subgraph_op.get()) + ->SetProgramDesc(sub_program_desc); + auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); + subgraph_op->Attach(subgraph_op_desc, any_op->scope()); - // Set input/output scale values of input/output var nodes for - // type_precision_cast_pass. - std::vector input_data_scales; - std::vector output_data_scales; + // Export the scale values of the input/output var nodes of the inner op nodes + // only for type_precision_cast_pass. 
for (auto &var_node : input_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->outlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("input_scale")) { - input_data_scales.push_back( - any_inst.op_info()->GetAttr("input_scale")); + if (any_inst.op_info()->HasInputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetInputScale( + var_node_name, any_inst.op_info()->GetInputScale(var_node_name)); } } for (auto &var_node : output_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->inlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("output_scale")) { - output_data_scales.push_back( - any_inst.op_info()->GetAttr("output_scale")); + if (any_inst.op_info()->HasOutputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetOutputScale( + var_node_name, any_inst.op_info()->GetOutputScale(var_node_name)); } } - if (input_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("input_data_scales", - input_data_scales); - } - if (output_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("output_data_scales", - output_data_scales); - } - - // Set all of the inputs and outputs to the target subgraph op - // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() - for (auto &var_node : weight_var_nodes) { - input_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : local_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : unused_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - subgraph_op_desc.SetInput("Inputs", input_var_names); - subgraph_op_desc.SetOutput("Outputs", output_var_names); - auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - static_cast(subgraph_op.get()) - ->SetSubBlock(sub_block_desc); - auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); - subgraph_op->Attach(subgraph_op_desc, any_op->scope()); // Create and add a new subgraph node into the graph auto subgraph_op_node = @@ -508,26 +510,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : input_var_nodes) { IR_NODE_LINK_TO(var_node, subgraph_op_node); } - for (auto &var_node : weight_var_nodes) { - IR_NODE_LINK_TO(var_node, subgraph_op_node); - } for (auto &var_node : output_var_nodes) { IR_OP_VAR_LINK(subgraph_op_node, var_node); } - for (auto &var_node : local_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } - for (auto &var_node : unused_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } // Remove subgraph nodes and unused var nodes - auto nodes2rm = GetNodes2RM(subgraph_nodes, - {input_var_nodes, - weight_var_nodes, - output_var_nodes, - local_var_nodes, - unused_var_nodes}); + auto nodes2rm = + GetNodes2RM(subgraph_nodes, {input_var_nodes, output_var_nodes}); GraphSafeRemoveNodes(graph, nodes2rm); } @@ -602,7 +591,17 @@ std::set GetNodes2RM( std::set nodes2rm(op_nodes.begin(), op_nodes.end()); for (auto &op_node : op_nodes) { for (auto &var_node : op_node->inlinks) { - if (!nodes2rm.count(var_node)) { + bool skip = false; + // skip the var node which is used by any other ops that doesn't belong to + // the subgraph ops. 
+ for (auto &out_op_node : var_node->outlinks) { + if (std::find(op_nodes.begin(), op_nodes.end(), out_op_node) != + op_nodes.end()) { + skip = true; + break; + } + } + if (!skip && !nodes2rm.count(var_node)) { nodes2rm.insert(var_node); } } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9..f7e354f7a22582991ca64fa2d5fcc147bf6ed427 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -20,7 +20,7 @@ #include "lite/api/paddle_use_passes.h" #include "lite/core/mir/ssa_graph.h" #include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" DEFINE_string(model_dir, "", "model_dir"); @@ -141,12 +141,11 @@ std::vector AddFetchDesc( } TEST(Subgraph, detect_simple_model) { - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); std::vector valid_places{{TARGET(kHost), PRECISION(kFloat)}}; auto scope = std::make_shared(); // Build a simple network - program_desc.ClearBlocks(); - auto* block_desc = program_desc.AddBlock(); + auto* block_desc = program_desc->AddBlock(); block_desc->ClearOps(); block_desc->ClearVars(); auto* var_desc = block_desc->AddVar(); @@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) { "the path of model files."; return; } - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); auto scope = std::make_shared(); LoadModelPb(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, scope.get(), - &program_desc, + program_desc.get(), !FLAGS_model_file.empty() && !FLAGS_params_file.empty(), false); std::vector valid_places({ @@ -200,6 +199,9 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}, +#endif #ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index f4df5c5f454c08c5f79dd220e579632dc7cf05a5..429c780912094baf9ceb8b5124dc197abd51af41 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void HuaweiAscendNPUSubgraphPass::Apply( + const std::unique_ptr& graph) { + std::set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void APUSubgraphPass::Apply(const std::unique_ptr& graph) { std::set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) \ @@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass, + paddle::lite::mir::HuaweiAscendNPUSubgraphPass) + .BindTargets({TARGET(kHuaweiAscendNPU)}); REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) .BindTargets({TARGET(kAPU)}); 
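Note: the HuaweiAscendNPUSubgraphPass added above follows the same pattern as the other device subgraph passes in this file: the supported-op set is produced by re-including the bridge registration header with USE_SUBGRAPH_BRIDGE redefined, and a teller lambda decides which statement nodes may be fused. The following is a minimal standalone sketch of that pattern only; the op names are placeholders and the real pass hands the teller to SubgraphFuser rather than printing.

// Standalone illustration of the "supported-op teller" idiom used by the
// device-specific subgraph passes. The bridge list is normally pulled in via
// `#include ".../paddle_use_bridges.h"`; here it is inlined with example ops.
#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> supported_lists;
  // Each USE_SUBGRAPH_BRIDGE(op_type, target) line expands to one insertion.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
  USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU)  // illustrative entries only
  USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU)
#undef USE_SUBGRAPH_BRIDGE

  // The teller answers "can this op be placed inside a subgraph op?".
  auto teller = [&](const std::string& op_type) {
    return supported_lists.count(op_type) != 0;
  };

  for (const std::string& op_type : {"conv2d", "relu", "top_k"}) {
    std::cout << op_type << (teller(op_type) ? " -> fuse" : " -> keep") << "\n";
  }
  return 0;
}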
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 8c2b501a62356c91e93f3c4ca91f70879d3c9229..c40a527cfe72ab1556e868d05aab5c0280fa4514 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class HuaweiAscendNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class APUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 8fd3751f9ca1585af6b8b00f23acd6bacf5b7a51..5a57623b0c984be24e2d0b97ee575b22d369fdad 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -13,8 +13,12 @@ // limitations under the License. #include + #include + #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -183,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + valid_places.push_back( + lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}); +#endif #ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 1133e5ba8203ec9fea177844a6311c993f6b8ff7..44b6eaf1eb0c5c96630dd66d129919b40f3ea8c6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply( REGISTER_MIR_PASS(type_layout_cast_pass, paddle::lite::mir::TypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, paddle::lite::mir::OpenCLTypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25648877568f6427843f8ded6890450c265b4f06..40ece35993cfd2f8bce07e605387741202973614 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op, op_desc->GetAttr>("input_data_names"); std::replace(input_data_names.begin(), input_data_names.end(), from, to); op_desc->SetAttr("input_data_names", input_data_names); - auto* subblock_desc = static_cast(op)->GetSubBlock(); - CHECK(subblock_desc); - for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { - auto* subblock_op_desc = subblock_desc->GetOp(i); - for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { - for (auto& subblock_var_name : subblock_op_input.second) { - if (subblock_var_name == from) { - subblock_var_name = to; + auto sub_program_desc = + static_cast(op)->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = op_desc->GetAttr("sub_block"); + auto sub_block_desc = + 
sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) { + for (auto& sub_var_name : sub_op_input.second) { + if (sub_var_name == from) { + sub_var_name = to; } } } @@ -66,65 +72,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { } } -// Infer the scale value for the new calib op from the subgraph op -static bool InferScaleFromSubgraph(std::string var_name, - const OpInfo* op_info, - float* scale, - bool reverse = false) { - std::string attr_name = reverse ? "output_data_names" : "input_data_names"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_names = - op_info->GetAttr>(attr_name); - attr_name = reverse ? "output_data_scales" : "input_data_scales"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_scales = op_info->GetAttr>(attr_name); - auto size = input_or_output_names.size(); - CHECK(size == input_or_output_scales.size()); - for (size_t i = 0; i < size; i++) { - if (input_or_output_names[i] == var_name) { - *scale = input_or_output_scales[i]; - return true; - } - } - return false; -} - // Infer the scale value for the new calib op from the input_scale of the // current op and output_scale of the previous op. // case 1: prev_op->var_node->op_node(int8->any op, with input_scale). -// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with -// input_data_scales). -// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, // without input_scale). -// case 4: prev_op(any->int8, subgraph_op, with -// output_data_scales)->var_node->op_node(fp32->any, without input_scale). 
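Note: the InferScale rewrite that follows no longer special-cases subgraph ops, because the flat "input_scale"/"output_scale" and "input_data_scales"/"output_data_scales" attributes are replaced by per-variable scales queried through HasInputScale/GetInputScale and HasOutputScale/GetOutputScale. As the op_lite.cc hunk later in this diff shows, those helpers store each scale vector under an attribute named "<argname><index>_scale". Below is a minimal standalone sketch of that key scheme; the plain std::map stands in for the real cpp::OpDesc attribute storage and is an assumption for illustration only.

// Sketch of the per-tensor scale attribute naming used by
// OpInfo::SetInputScale/GetInputScale: "<argname><index>_scale".
#include <cassert>
#include <map>
#include <string>
#include <vector>

using AttrMap = std::map<std::string, std::vector<float>>;

static std::string ScaleAttrName(const std::string& argname, int index) {
  return argname + std::to_string(index) + "_scale";
}

int main() {
  AttrMap attrs;
  // e.g. the variable bound to input argument "X" at index 0 of a quantized op.
  attrs[ScaleAttrName("X", 0)] = {0.017f};
  // conv2d/depthwise_conv2d/mul weights may carry one scale per output channel.
  attrs[ScaleAttrName("Filter", 0)] = {0.01f, 0.02f, 0.03f};

  assert(attrs.count(ScaleAttrName("X", 0)) == 1);   // HasInputScale analogue
  auto x_scale = attrs.at(ScaleAttrName("X", 0));    // GetInputScale analogue
  assert(x_scale.size() == 1 && x_scale[0] == 0.017f);
  return 0;
}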
static bool InferScale(Node* var_node, Node* op_node, float* scale) { bool found = false; auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); auto var_name = var_node->AsArg().name; - if (op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, op_info, scale, false); + if (op_info->HasInputScale(var_name)) { + *scale = op_info->GetInputScale(var_name)[0]; + found = true; } else { - if (op_info->HasAttr("input_scale")) { - *scale = op_info->GetAttr("input_scale"); + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_info->HasOutputScale(var_name)) { + *scale = prev_op_info->GetOutputScale(var_name)[0]; found = true; - } else { - // Obtain the output_scale from one of its previous Ops - auto prev_op_node = var_node->inlinks.front(); - CHECK(prev_op_node->IsStmt()); - auto& prev_inst = prev_op_node->AsStmt(); - auto prev_op_info = prev_inst.op_info(); - auto prev_op_type = prev_op_info->Type(); - if (prev_op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); - } else { - if (prev_op_info->HasAttr("output_scale")) { - *scale = prev_op_info->GetAttr("output_scale"); - found = true; - } - } } } return found; diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index d9f420cfad90d3c6a1f08072d8c5f87d2326661a..f7d35bfef3ac53903448c48300c144f8fd15652d 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -59,25 +59,46 @@ class VariablePlaceInferencePass : public DebugPass { } // Set the type of the weight - void SetWeightType(Node* w, + void SetWeightType(Node* weight_node, const LiteType& type, - const std::map& lite_with_targets) { + const std::map& with_targets) { VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); - if (lite_with_targets.at("kFPGA")) { - w->AsArg().type = LiteType::GetTensorTy( + if (with_targets.at("kFPGA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kOpenCL")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kOpenCL")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kCUDA")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kCUDA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); } else { - w->AsArg().type = LiteType::GetTensorTy( + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); } } + // Update a's kUnk fields from b's fields. 
+ void UpdateTypeFrom(const Type** a, const Type* b) { + auto target = (*a)->target(); + auto precision = (*a)->precision(); + auto layout = (*a)->layout(); + if (target == TARGET(kUnk)) { + target = b->target(); + } + if (precision == PRECISION(kUnk)) { + precision = b->precision(); + } + if (layout == DATALAYOUT(kUnk)) { + layout = b->layout(); + } + if ((*a)->IsTensor() && b->IsTensor()) { + *a = LiteType::GetTensorTy(target, precision, layout); + } else if ((*a)->IsTensorList() && b->IsTensorList()) { + *a = LiteType::GetTensorListTy(target, precision, layout); + } + } + void InferenceArgumentPlace(SSAGraph* graph) { auto& valid_places = graph->valid_places(); auto valid_places_has_target = [&](TargetType t) -> bool { @@ -88,122 +109,90 @@ class VariablePlaceInferencePass : public DebugPass { } return false; }; - std::map lite_with_targets{ + std::map with_targets{ {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, {"kCUDA", valid_places_has_target(TARGET(kCUDA))}, {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; - VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; - VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"]; + VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"]; VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); - for (auto& x : graph->StmtTopologicalOrder()) { - auto& inst = x->AsStmt(); + for (auto& node : graph->StmtTopologicalOrder()) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + const auto& op_type = op_info->Type(); + auto& kernel = inst.picked_kernel(); + // The IoCopyOp is a tool operator, it won't support the type inference. // in fpga, we has io_copy+cali+layout tool ops, so we need type inference - // for - // tool operator - if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; + // for tool operator + if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) { + VLOG(3) << "skip 'io_copy' if target is FPGA and OpenCL"; + if (op_type == "io_copy") continue; } - // deal with inputs - VLOG(4) << "Infering op " << inst.op_info()->Repr(); - // TODO(zhaolong): Add check if the node's name in op's arguments. 
- auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; - - for (auto* x_in : x->inlinks) { - std::string node_name = x_in->AsArg().name; - std::string arg_name = get_argname(node_name, inst.op_info()->inputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name; - VLOG(4) << "-- input arg_name:" << arg_name << " " - << "-- node name:" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); - if (!x_in->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; - if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type, lite_with_targets); + // Infering the input and output variable's place according to the + // declaration of I/O arguments of the picked kernel of the op + VLOG(4) << "Op " << op_info->Repr(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument for var " << var_name; + VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name; + const auto* decl_type = kernel.GetInputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(in_node, *decl_type, with_targets); } else { - x_in->AsArg().type = type; + *var_type = decl_type; } - } else if (x_in->AsArg().type->target() == TARGET(kUnk) && - x_in->AsArg().type->precision() != PRECISION(kUnk) && - x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. 
- if (type->precision() == PRECISION(kInt8)) { - x_in->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8)) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_in->AsArg().type->precision(); - x_in->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } - - VLOG(4) << "inst " << inst.op_info()->Repr(); - for (auto* x_out : x->outlinks) { - std::string node_name = x_out->AsArg().name; - std::string arg_name = - get_argname(node_name, inst.op_info()->outputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name << " in Inst " - << inst.op_type(); - VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); - if (!x_out->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; - if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type, lite_with_targets); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument for var " << var_name; + VLOG(4) << " - output arg name:" << arg_name + << " var name:" << var_name; + const auto* decl_type = kernel.GetOutputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(out_node, *decl_type, with_targets); } else { - x_out->AsArg().type = type; + *var_type = decl_type; } - } else if (x_out->AsArg().type->target() == TARGET(kUnk) && - x_out->AsArg().type->precision() != PRECISION(kUnk) && - x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. - if (type->precision() == PRECISION(kInt8)) { - x_out->AsArg().type = type; - } else if (type->precision() == PRECISION(kFP16) && - type->target() != TARGET(kOpenCL)) { - x_out->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8) || + (decl_type->precision() == PRECISION(kFP16) && + decl_type->target() != TARGET(kOpenCL))) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_out->AsArg().type->precision(); - x_out->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } } } - // Update me's kUnk fields by other's fields. - void UpdatePlace(Place* me, const Place& other) { - CHECK(other.is_valid()); - if (me->target == TARGET(kUnk)) { - me->target = other.target; - } - if (me->precision == PRECISION(kUnk)) { - me->precision = other.precision; - } - if (me->layout == DATALAYOUT(kUnk)) { - me->layout = other.layout; - } - } - private: - // The default target for arguments, e.g. load weights to CPU memory for CUDA - // computation by default. + // The default target for arguments, e.g. load weights to CPU memory for + // CUDA computation by default. 
TargetType argument_default_target_{TARGET(kHost)}; }; diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 537636065d6aeea67fd7c8c71fb00b183720fecc..585aaf3b703bca0a0a34030106dbf793e2a31d52 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -18,6 +18,7 @@ #include #include #include "lite/core/op_registry.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -186,5 +187,114 @@ void OpLite::AttachOutput(const cpp::OpDesc &op_desc, } } +bool OpInfo::GetInputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetOutputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetInputIndex(const std::string &input_name, int *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), input_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), output_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::HasInputScale(const std::string &input_name) const { + std::string argname; + int index; + if (GetInputArgname(input_name, &argname) && + GetInputIndex(input_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +bool OpInfo::HasOutputScale(const std::string &output_name) const { + std::string argname; + int index; + if (GetOutputArgname(output_name, &argname) && + GetOutputIndex(output_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +void OpInfo::SetInputScale(const std::string &input_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetInputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +void OpInfo::SetOutputScale(const std::string &output_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetOutputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +std::vector OpInfo::GetInputScale(const std::string &input_name) const { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + +std::vector OpInfo::GetOutputScale( + const std::string &output_name) const { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + return GetAttr>(argname + to_string(index) + 
"_scale"); +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 301065d5b6bb5c4f41b19d9a9034985ca2f74d89..d94753220a1b5d963092c62c43d7e49b03243c63 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -24,7 +24,7 @@ #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/scope.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/op_params.h" namespace paddle { @@ -99,7 +99,7 @@ class OpLite : public Registry { std::vector> CreateKernels( const std::vector &places, const std::string &kernel_type = ""); - lite::Scope *scope() { return scope_; } + Scope *scope() { return scope_; } // Assign op param to kernel. virtual void AttachKernel(KernelBase *kernel) = 0; @@ -169,7 +169,7 @@ class OpLite : public Registry { } protected: - lite::Scope *scope_{nullptr}; + Scope *scope_{nullptr}; std::unique_ptr kernel_; std::string op_type_; std::vector valid_places_; @@ -229,55 +229,8 @@ class OpInfo : public cpp::OpDesc { return OutputArgumentNames(); } - bool GetInputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - bool GetOutputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - - // For the input variable name, find the index of the corresponding - // input argname - bool GetInputIndex(const std::string &value_name, int *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - - // For the output variable name, find the index of the corresponding - // output argname - bool GetOutputIndex(const std::string &value_name, int *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - void UpdateAllInputs(const std::string &from, const std::string &to) { - for (auto &item : inputs_) { + for (auto &item : *mutable_inputs()) { for (auto &var : item.second) { if (var == from) var = to; } @@ -285,12 +238,32 @@ class OpInfo : public cpp::OpDesc { } void UpdateAllOutputs(const std::string &from, const std::string &to) { - for (auto &item : outputs_) { + for (auto &item : *mutable_outputs()) { for (auto &var : item.second) { if (var == from) var = to; } } } + + bool GetInputArgname(const std::string &value_name, std::string *out) const; + bool GetOutputArgname(const std::string &value_name, std::string *out) const; + + bool GetInputIndex(const std::string &input_name, int *out) const; + bool GetOutputIndex(const std::string &output_name, int *out) const; + + bool HasInputScale(const std::string &input_name) const; + bool HasOutputScale(const std::string &output_name) const; + + void SetInputScale(const std::string &input_name, + const std::vector &scale_value); + void SetOutputScale(const std::string &output_name, + const std::vector &scale_value); + + // For conv2d, depthwise_conv2d and mul, 
the scale of weight are a vector. + // Otherwise, all input and output scales are scalar, but we save these + // as vecotr. + std::vector GetInputScale(const std::string &input_name) const; + std::vector GetOutputScale(const std::string &output_name) const; }; } // namespace lite diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index ef6d3cfaf001ea55cef23faee11d508920c49715..cb773edd18ee236a30cbfcf5d6b1ce5773f0269d 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -17,277 +17,5 @@ #include namespace paddle { -namespace lite { - -const std::map &GetOp2PathDict() { - return OpKernelInfoCollector::Global().GetOp2PathDict(); -} - -std::list> KernelRegistry::Create( - const std::string &op_type, - TargetType target, - PrecisionType precision, - DataLayoutType layout) { - Place place{target, precision, layout}; - VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString(); -#define CREATE_KERNEL1(target__, precision__) \ - switch (layout) { \ - case DATALAYOUT(kNCHW): \ - return Create(op_type); \ - case DATALAYOUT(kAny): \ - return Create(op_type); \ - case DATALAYOUT(kNHWC): \ - return Create(op_type); \ - case DATALAYOUT(kImageDefault): \ - return Create(op_type); \ - case DATALAYOUT(kImageFolder): \ - return Create(op_type); \ - case DATALAYOUT(kImageNW): \ - return Create(op_type); \ - default: \ - LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ - } - -#define CREATE_KERNEL(target__) \ - switch (precision) { \ - case PRECISION(kFloat): \ - CREATE_KERNEL1(target__, kFloat); \ - case PRECISION(kInt8): \ - CREATE_KERNEL1(target__, kInt8); \ - case PRECISION(kFP16): \ - CREATE_KERNEL1(target__, kFP16); \ - case PRECISION(kAny): \ - CREATE_KERNEL1(target__, kAny); \ - case PRECISION(kInt32): \ - CREATE_KERNEL1(target__, kInt32); \ - case PRECISION(kInt64): \ - CREATE_KERNEL1(target__, kInt64); \ - default: \ - CHECK(false) << "not supported kernel precision " \ - << PrecisionToStr(precision); \ - } - - switch (target) { - case TARGET(kHost): { - CREATE_KERNEL(kHost); - } break; -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - case TARGET(kX86): { - CREATE_KERNEL(kX86); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - case TARGET(kCUDA): { - CREATE_KERNEL(kCUDA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - case TARGET(kARM): { - CREATE_KERNEL(kARM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - case TARGET(kOpenCL): { - CREATE_KERNEL(kOpenCL); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - case TARGET(kNPU): { - CREATE_KERNEL(kNPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - case TARGET(kAPU): { - CREATE_KERNEL(kAPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_XPU) - case TARGET(kXPU): { - CREATE_KERNEL(kXPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - case TARGET(kFPGA): { - CREATE_KERNEL(kFPGA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - case TARGET(kBM): { - CREATE_KERNEL(kBM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - case TARGET(kMLU): { - CREATE_KERNEL(kMLU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - case TARGET(kRKNPU): { - CREATE_KERNEL(kRKNPU); - } break; -#endif - default: - CHECK(false) << "not supported kernel 
target " << TargetToStr(target); - } - -#undef CREATE_KERNEL - return std::list>(); -} - -KernelRegistry::KernelRegistry() : registries_() { -#define INIT_FOR(target__, precision__, layout__) \ - registries_[std::make_tuple(TARGET(target__), \ - PRECISION(precision__), \ - DATALAYOUT(layout__))] \ - .set *>( \ - &KernelRegistryForTarget::Global()); -// Currently, just register 2 kernel targets. -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kFloat, kNHWC); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kFP16, kNCHW); - INIT_FOR(kCUDA, kFP16, kNHWC); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - INIT_FOR(kCUDA, kInt64, kNCHW); - INIT_FOR(kCUDA, kInt64, kNHWC); - INIT_FOR(kCUDA, kInt32, kNCHW); - INIT_FOR(kCUDA, kInt32, kNHWC); -#endif - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - INIT_FOR(kMLU, kFloat, kNHWC); - INIT_FOR(kMLU, kFloat, kNCHW); - INIT_FOR(kMLU, kFP16, kNHWC); - INIT_FOR(kMLU, kFP16, kNCHW); - INIT_FOR(kMLU, kInt8, kNHWC); - INIT_FOR(kMLU, kInt8, kNCHW); - INIT_FOR(kMLU, kInt16, kNHWC); - INIT_FOR(kMLU, kInt16, kNCHW); -#endif - - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kBool, kNCHW); - INIT_FOR(kHost, kBool, kNHWC); - INIT_FOR(kHost, kBool, kAny); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kFP16, kNCHW); - INIT_FOR(kHost, kFP16, kNHWC); - INIT_FOR(kHost, kFP16, kAny); - INIT_FOR(kHost, kInt8, kNCHW); - INIT_FOR(kHost, kInt8, kNHWC); - INIT_FOR(kHost, kInt8, kAny); - INIT_FOR(kHost, kInt16, kNCHW); - INIT_FOR(kHost, kInt16, kNHWC); - INIT_FOR(kHost, kInt16, kAny); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt32, kNHWC); - INIT_FOR(kHost, kInt32, kAny); - INIT_FOR(kHost, kInt64, kNCHW); - INIT_FOR(kHost, kInt64, kNHWC); - INIT_FOR(kHost, kInt64, kAny); - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - INIT_FOR(kX86, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kFloat, kNHWC); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kInt8, kNHWC); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - INIT_FOR(kARM, kInt32, kNCHW); - INIT_FOR(kARM, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kFloat, kNHWC); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kNHWC); - INIT_FOR(kOpenCL, kFloat, kAny); - INIT_FOR(kOpenCL, kInt8, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - INIT_FOR(kOpenCL, kFP16, kNCHW); - INIT_FOR(kOpenCL, kFP16, kNHWC); - INIT_FOR(kOpenCL, kFP16, kImageDefault); - INIT_FOR(kOpenCL, kFP16, kImageFolder); - INIT_FOR(kOpenCL, kFP16, kImageNW); - INIT_FOR(kOpenCL, kFloat, kImageDefault); - INIT_FOR(kOpenCL, kFloat, kImageFolder); - INIT_FOR(kOpenCL, kFloat, kImageNW); - INIT_FOR(kOpenCL, kAny, kImageDefault); - INIT_FOR(kOpenCL, kAny, kImageFolder); - INIT_FOR(kOpenCL, kAny, kImageNW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kFloat, kNHWC); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kInt8, kNHWC); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kNHWC); - INIT_FOR(kNPU, kAny, 
kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - INIT_FOR(kAPU, kInt8, kNCHW); - INIT_FOR(kXPU, kFloat, kNCHW); - INIT_FOR(kXPU, kInt8, kNCHW); - INIT_FOR(kXPU, kAny, kNCHW); - INIT_FOR(kXPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - INIT_FOR(kBM, kFloat, kNCHW); - INIT_FOR(kBM, kInt8, kNCHW); - INIT_FOR(kBM, kAny, kNCHW); - INIT_FOR(kBM, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - INIT_FOR(kRKNPU, kFloat, kNCHW); - INIT_FOR(kRKNPU, kInt8, kNCHW); - INIT_FOR(kRKNPU, kAny, kNCHW); - INIT_FOR(kRKNPU, kAny, kAny); -#endif - -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 2128e218554fb304474c14cfacd7867e491a4fe6..90a2b563af7e17a4806bd47cb883d9590cdab40f 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -33,19 +32,19 @@ using LiteType = paddle::lite::Type; class OpKernelInfoCollector { public: - static OpKernelInfoCollector &Global() { - static auto *x = new OpKernelInfoCollector; + static OpKernelInfoCollector& Global() { + static auto* x = new OpKernelInfoCollector; return *x; } - void AddOp2path(const std::string &op_name, const std::string &op_path) { + void AddOp2path(const std::string& op_name, const std::string& op_path) { size_t index = op_path.find_last_of('/'); if (index != std::string::npos) { op2path_.insert(std::pair( op_name, op_path.substr(index + 1))); } } - void AddKernel2path(const std::string &kernel_name, - const std::string &kernel_path) { + void AddKernel2path(const std::string& kernel_name, + const std::string& kernel_path) { size_t index = kernel_path.find_last_of('/'); if (index != std::string::npos) { kernel2path_.insert(std::pair( @@ -53,13 +52,13 @@ class OpKernelInfoCollector { } } void SetKernel2path( - const std::map &kernel2path_map) { + const std::map& kernel2path_map) { kernel2path_ = kernel2path_map; } - const std::map &GetOp2PathDict() { + const std::map& GetOp2PathDict() { return op2path_; } - const std::map &GetKernel2PathDict() { + const std::map& GetKernel2PathDict() { return kernel2path_; } @@ -71,409 +70,185 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { -const std::map &GetOp2PathDict(); - -using KernelFunc = std::function; -using KernelFuncCreator = std::function()>; -class LiteOpRegistry final : public Factory> { +class OpLiteFactory { public: - static LiteOpRegistry &Global() { - static auto *x = new LiteOpRegistry; - return *x; + // Register a function to create an op + void RegisterCreator(const std::string& op_type, + std::function()> fun) { + op_registry_[op_type] = fun; } - private: - LiteOpRegistry() = default; -}; - -template -class OpLiteRegistor : public Registor { - public: - explicit OpLiteRegistor(const std::string &op_type) - : Registor([&] { - LiteOpRegistry::Global().Register( - op_type, [op_type]() -> std::unique_ptr { - return std::unique_ptr(new OpClass(op_type)); - }); - }) {} -}; -template -using KernelRegistryForTarget = - Factory, 
std::unique_ptr>; - -class KernelRegistry final { - public: - using any_kernel_registor_t = - variant *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + static OpLiteFactory& Global() { + static OpLiteFactory* x = new OpLiteFactory; + return *x; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::shared_ptr Create(const std::string& op_type) const { + auto it = op_registry_.find(op_type); + if (it == op_registry_.end()) return nullptr; + return it->second(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::string DebugString() const { + STL::stringstream ss; + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; + } + return ss.str(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::vector GetAllOps() const { + std::vector res; + for (const auto& op : op_registry_) { + res.push_back(op.first); + } + return res; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + protected: + std::map()>> op_registry_; +}; - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget * // - >; +using LiteOpRegistry = OpLiteFactory; - KernelRegistry(); +// Register OpLite by initializing a static OpLiteRegistrar instance +class OpLiteRegistrar { + public: + OpLiteRegistrar(const std::string& op_type, + std::function()> fun) { + OpLiteFactory::Global().RegisterCreator(op_type, fun); + } + // Touch function is used to guarantee registrar was initialized. 
+ void touch() {} +}; - static KernelRegistry &Global(); +class KernelFactory { + public: + // Register a function to create kernels + void RegisterCreator(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back( + fun); + } - template - void Register( - const std::string &name, - typename KernelRegistryForTarget::creator_t - &&creator) { - using kernel_registor_t = - KernelRegistryForTarget; - auto &varient = registries_[std::make_tuple(Target, Precision, Layout)]; - auto *reg = varient.template get(); - CHECK(reg) << "Can not be empty of " << name; - reg->Register(name, std::move(creator)); -#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL - kernel_info_map_[name].push_back( - std::make_tuple(Target, Precision, Layout)); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL + static KernelFactory& Global() { + static KernelFactory* x = new KernelFactory; + return *x; } - template - std::list> Create(const std::string &op_type) { - using kernel_registor_t = - KernelRegistryForTarget; - std::list> kernel_list; - std::tuple temp_tuple( - Target, Precision, Layout); - if (registries_[temp_tuple].valid()) { - kernel_list = - registries_[temp_tuple].template get()->Creates( - op_type); + /** + * Create all kernels belongs to an op. + */ + std::list> Create(const std::string& op_type) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) { + for (auto& fun : it->second) { + res.emplace_back(fun()); + } } - return kernel_list; + return res; } - std::list> Create(const std::string &op_type, + /** + * Create a specific kernel. Return a list for API compatible. + */ + std::list> Create(const std::string& op_type, TargetType target, PrecisionType precision, - DataLayoutType layout); + DataLayoutType layout) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + auto it = kernel_registry.find(std::make_tuple(target, precision, layout)); + if (it == kernel_registry.end()) return res; + for (auto& fun : it->second) { + res.emplace_back(fun()); + } + return res; + } std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - count += item.second.size(); - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; } - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL } - private: - mutable std::map, - any_kernel_registor_t> - registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif + protected: + // Outer map: op -> a map of kernel. + // Inner map: kernel -> creator function. 
+ // Each kernel was represented by a combination of + std::map, + std::list()>>>> + op_registry_; }; -template -class KernelRegistor : public lite::Registor { +using KernelRegistry = KernelFactory; + +// Register Kernel by initializing a static KernelRegistrar instance +class KernelRegistrar { public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} + KernelRegistrar(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + KernelFactory::Global().RegisterCreator( + op_type, target, precision, layout, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} }; } // namespace lite } // namespace paddle -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ +// Register an op. +#define REGISTER_LITE_OP(op_type__, OpClass) \ + static paddle::lite::OpLiteRegistrar op_type__##__registry( \ + #op_type__, []() { \ + return std::unique_ptr(new OpClass(#op_type__)); \ + }); \ + int touch_op_##op_type__() { \ + op_type__##__registry.touch(); \ + OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ + return 0; \ } -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__##layout__##registor__instance__##alias__ // NOLINT - -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - +// Register a kernel. 
#define REGISTER_LITE_KERNEL( \ op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ + static paddle::lite::KernelRegistrar \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry( \ + #op_type__, \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__), \ + []() { \ + std::unique_ptr x(new KernelClass); \ + x->set_op_type(#op_type__); \ + x->set_alias(#alias__); \ + return x; \ + }); \ int touch_##op_type__##target__##precision__##layout__##alias__() { \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry \ + .touch(); \ OpKernelInfoCollector::Global().AddKernel2path( \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ return 0; \ } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) UNUSED = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register + static auto \ + op_type__##target__##precision__##layout__##alias__##param_register \ + UNUSED = paddle::lite::ParamTypeRegistry::NewInstance< \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__)>(#op_type__ "/" #alias__) diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 05f801facdf9557da1e872d69fcde0bf3b321d2e..42dac8e59bda84ce5dc2cb04f2f3712d1386b96c 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -19,6 +19,7 @@ #include #include #include +#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h" #include "lite/core/mir/generate_program_pass.h" #include "lite/core/mir/pass_manager.h" #include "lite/core/mir/pass_utils.h" @@ -36,6 +37,9 @@ namespace lite { * lite::Optimizer optimize a program. It utilize the mir passes to analysis the * program and export an optimized program. 
*/ +// TODO(hong1986032) Support the following passes for the subblocks +const std::set kSubblockUnsupportedPasses( + {"memory_optimize_pass"}); class Optimizer { public: Optimizer() {} @@ -60,14 +64,20 @@ class Optimizer { program_ = &program; valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; - CHECK(!graph_) << "duplicate optimize found"; - - graph_.reset(new mir::SSAGraph); - graph_->Build(program, valid_places); - graph_->SetValidPlaces(valid_places); + CHECK(graphs_.empty()) << "duplicate optimize found"; + + auto block_size = program.block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + std::unique_ptr graph; + graph.reset(new mir::SSAGraph); + graph->Build(program, valid_places, block_idx); + graph->SetValidPlaces(valid_places); + graphs_.emplace_back(std::move(graph)); + } SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); if (passes.empty() || passes.size() == 1) { std::vector passes_local{ @@ -76,6 +86,7 @@ class Optimizer { "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise + "lite_conv_conv_fuse_pass", // // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -94,6 +105,8 @@ class Optimizer { #endif "identity_dropout_eliminate_pass", "__xpu__resnet_fuse_pass", + "__xpu__resnet_cbam_fuse_pass", + "__xpu__mmdnn_fuse_pass", "__xpu__multi_encoder_fuse_pass", "__xpu__embedding_with_eltwise_add_fuse_pass", "__xpu__fc_fuse_pass", @@ -104,12 +117,19 @@ class Optimizer { // 'enable_int8' for all // of the quantized ops. "npu_subgraph_pass", + "huawei_ascend_npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", "apu_subgraph_pass", "rknpu_subgraph_pass", - "static_kernel_pick_pass", // pick original kernel from graph + "mlu_subgraph_pass", + "control_flow_op_unused_inputs_and_outputs_eliminate_pass", + "static_kernel_pick_pass", // pick original kernel from graph + + "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's + + "mlu_postprocess_pass", // info(target/precision/layout/device) // using kernel info "argument_type_display_pass", // debug pass: show arg-type-node's @@ -139,13 +159,9 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", - "mlu_subgraph_pass", - "runtime_context_assign_pass", "argument_type_display_pass", - "mlu_postprocess_pass", - "memory_optimize_pass"}}; if (passes.size() == 1) { @@ -172,13 +188,15 @@ class Optimizer { exec_scope_ = program.exec_scope(); } - const lite::Scope* exec_scope() const { return exec_scope_; } + const Scope* exec_scope() const { return exec_scope_; } // Generate a new program based on the mir graph. 
std::unique_ptr GenRuntimeProgram() { auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); - pass->Apply(graph_); + for (auto& graph : graphs_) { + pass->Apply(graph); + } auto program = pass->GenProgram(); CHECK(exec_scope_); program->set_exec_scope(exec_scope_); @@ -194,20 +212,32 @@ class Optimizer { pass->SetValidPlaces(valid_places_); } + void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() { + auto* pass = + mir::PassManager::Global() + .LookUp( + "control_flow_op_unused_inputs_and_outputs_eliminate_pass"); + CHECK(pass); + CHECK(!graphs_.empty()); + pass->SetAllGraphs(&graphs_); + } + // Generate C++ code which combines the inference program, model and weights. void GenCode(const std::string& code_dir); - const mir::SSAGraph& ssa_graph() const { - CHECK(graph_); - return *graph_; + const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return *graphs_[block_idx]; } - mir::SSAGraph* mutable_ssa_graph() { - CHECK(graph_); - return graph_.get(); + mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return graphs_[block_idx].get(); } - lite::Scope* exec_scope() { return exec_scope_; } + Scope* exec_scope() { return exec_scope_; } protected: void SpecifyKernelPickTactic(core::KernelPickFactor factor); @@ -231,16 +261,23 @@ class Optimizer { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; } else { - pass->Apply(graph_); + // Check the pass whether it is supported for processing subblocks + if (kSubblockUnsupportedPasses.count(x)) { + pass->Apply(graphs_[kRootBlockIdx]); + } else { + for (auto& graph : graphs_) { + pass->Apply(graph); + } + } LOG(INFO) << "== Finished running: " << x; } } } private: - std::unique_ptr graph_; + std::vector> graphs_; std::vector valid_places_; - lite::Scope* exec_scope_{}; + Scope* exec_scope_{}; Program* program_{}; }; diff --git a/lite/core/program.cc b/lite/core/program.cc index 6dee8796ee1dcd944940f41cb9454344fb8367a7..bd6dd09683b5004167ee1f8d6426fde0fff4f6b0 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -15,9 +15,8 @@ #include "lite/core/program.h" #include #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" @@ -28,122 +27,221 @@ namespace paddle { namespace lite { -void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - // NOTE: RuntimeProgram do not has all meta info, so save model just update - // upon origin model - CHECK(desc->BlocksSize()); - auto main_block = desc->GetBlock(0); - main_block->ClearOps(); - for (auto& node : instructions_) { - auto op_type = node.op()->op_info()->Type(); - if (op_type == "subgraph") { - auto subgraph_op = const_cast( - static_cast(node.op())); - int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); - if (sub_block_idx < 0) { - // It's a new subgraph op when its sub_block_idx < 0, Now we add its +void RuntimeProgram::SaveToProgram( + std::shared_ptr program_desc) { + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK_GT(block_size, 0) << "No block found!"; + // TODD(hong19860320) Only support updating the block desc which already + // exists in the origin program 
desc + CHECK_LE(block_size, instructions_.size()) + << "Invalid block size, expected (0," << instructions_.size() + << "] but got " << block_size; + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + // Record all of the origin vars in the origin block + std::map origin_var_maps; + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto v = block_desc->GetVar(var_idx); + origin_var_maps.emplace(v->Name(), *v); + } + // Update the ops and vars for each block according to the instructions + block_desc->ClearVars(); + block_desc->ClearOps(); + std::set already_added_vars; + for (auto& inst : instructions_[block_idx]) { + auto* op = const_cast(inst.op()); + auto* op_info = op->op_info(); + auto op_type = op_info->Type(); + auto* kernel = inst.mutable_kernel(); + auto* scope = op->scope(); + // Update the origin vars which are referred by the instructions + // Add the new vars which are created in the passes and referred by the + // instructions + auto var_names = op_info->input_names(); + auto out_names = op_info->output_names(); + // Combine input and output vars and delete the duplicates + var_names.insert(var_names.end(), out_names.begin(), out_names.end()); + std::stable_sort(var_names.begin(), var_names.end()); + var_names.erase(std::unique(var_names.begin(), var_names.end()), + var_names.end()); + for (auto& var_name : var_names) { + if (already_added_vars.count(var_name)) continue; + auto* v = block_desc->AddVar(); + v->SetName(var_name); + auto it = origin_var_maps.find(var_name); + if (it != origin_var_maps.end()) { + v->SetType(it->second.GetType()); + v->SetPersistable(it->second.Persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(it->second.GetShape()); + v->SetDataType(it->second.GetDataType()); + } + } else { + std::string arg_name; + const Type* decl_type; + if (op_info->GetInputArgname(var_name, &arg_name)) { + decl_type = kernel->GetInputDeclType(arg_name); + } else { + op_info->GetOutputArgname(var_name, &arg_name); + decl_type = kernel->GetOutputDeclType(arg_name); + } + if (decl_type->IsTensor()) { + v->SetType(cpp::VarDesc::Type::LOD_TENSOR); + auto tensor = scope->FindVar(var_name)->GetMutable(); + v->SetPersistable(tensor->persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(tensor->dims().data()); + auto precision = tensor->precision(); + switch (precision) { +#define SET_DATATYPE(precision__, data_type) \ + case PrecisionType::precision__: \ + v->SetDataType(data_type); \ + LOG(INFO) << "Update var " << var_name << " done"; \ + break + SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); + SET_DATATYPE(kFloat, VarDescAPI::VarDataType::FP32); + SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); + SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); + SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); + SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); + SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); +#undef SET_DATATYPE + default: + LOG(WARNING) << "Unknown precision type " + << PrecisionToStr(precision) << " for var " + << var_name << " in op " << op_type; + } + } + } else if (decl_type->IsTensorList()) { + // Set persistable=false for tensor array + v->SetType(cpp::VarDesc::Type::LOD_TENSOR_ARRAY); + v->SetPersistable(false); + } else { + CHECK(false) << "Unsupported decl type " << *decl_type + << " for var " << var_name << " in op " << op_type; + } + } + 
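[Editor's note] When SaveToProgram rebuilds a var desc from a runtime tensor, the SET_DATATYPE switch above converts the tensor's PrecisionType back into the serialized var data type, and an unknown precision is only warned about. A reduced sketch of that mapping; the enum values here are simplified stand-ins for PrecisionType and VarDescAPI::VarDataType:

#include <iostream>

enum class PrecisionType { kUnk, kBool, kFloat, kFP16, kInt8, kInt16, kInt32, kInt64 };
enum class VarDataType { UNK, BOOL, FP32, FP16, INT8, INT16, INT32, INT64 };

// Mirror of the SET_DATATYPE switch: runtime precision -> serialized var type.
VarDataType ToVarDataType(PrecisionType p) {
  switch (p) {
    case PrecisionType::kBool:  return VarDataType::BOOL;
    case PrecisionType::kFloat: return VarDataType::FP32;
    case PrecisionType::kFP16:  return VarDataType::FP16;
    case PrecisionType::kInt8:  return VarDataType::INT8;
    case PrecisionType::kInt16: return VarDataType::INT16;
    case PrecisionType::kInt32: return VarDataType::INT32;
    case PrecisionType::kInt64: return VarDataType::INT64;
    default:
      std::cerr << "Unknown precision type, leaving var data type unset\n";
      return VarDataType::UNK;
  }
}

int main() {
  std::cout << (ToVarDataType(PrecisionType::kFloat) == VarDataType::FP32) << "\n";  // 1
}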
already_added_vars.insert(var_name); + } + // Replace all of origin ops with the instructions + auto op_desc = block_desc->AddOp(); + *op_desc = *op_info; + op_desc->SetAttr(kKernelTypeAttr, kernel->SerializedKernelType()); + if (op_type == "subgraph" && !op_info->GetAttr("sub_block")) { + // It's a new subgraph op when its sub_block_idx = 0, Now we add its // subblock desc to the program desc, Then update its sub_block_idx to // the index of block desc of the program desc. - sub_block_idx = desc->BlocksSize(); - auto sub_block_desc = subgraph_op->GetSubBlock(); - CHECK(sub_block_desc); - auto new_block_desc = desc->AddBlock(); - *new_block_desc = *sub_block_desc; - delete sub_block_desc; - subgraph_op->mutable_op_info()->SetAttr("sub_block", - sub_block_idx); - subgraph_op->SetSubBlock(new_block_desc); - // Update main block desc after a new subblock desc is added - main_block = desc->GetBlock(0); + auto subgraph_op = static_cast(op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + auto sub_block_desc = program_desc->AddBlock(); + *sub_block_desc = *sub_program_desc->GetBlock(0); + subgraph_op->SetProgramDesc(program_desc); + op_desc->SetAttr("sub_block", program_desc->BlocksSize() - 1); + // Attach op and kernel again to update the new block_idx and + // program_desc + subgraph_op->Attach(*op_desc, scope); + subgraph_op->AttachKernel(kernel); + // Update the pointer of block desc after a new subblock desc is added + block_desc = program_desc->GetBlock(block_idx); } } - auto op = main_block->AddOp(); - *op = *node.op()->op_info(); - op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); } } -// `UpdateVarsOfProgram` will remove unused var_descs and add new created -// vars' descs in the block 0. Now, the type of a new created var can only -// be LOD_TENSOR. -void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - CHECK(desc->BlocksSize()); - std::map origin_var_maps; - auto& main_block = *desc->GetBlock(0); - auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { - auto v = main_block.GetVar(i); - auto name = v->Name(); - origin_var_maps.emplace(name, *v); - } - - main_block.ClearVars(); - for (auto& node : instructions_) { - auto* op = const_cast(node.op()); - auto* kernel = node.kernel(); - auto* scope = op->scope(); - auto in_names = op->op_info()->input_names(); - auto out_names = op->op_info()->output_names(); - in_names.insert(in_names.end(), out_names.begin(), out_names.end()); - std::stable_sort(in_names.begin(), in_names.end()); - in_names.erase(std::unique(in_names.begin(), in_names.end()), - in_names.end()); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") { - v->SetShape((it->second).GetShape()); - v->SetDataType((it->second).GetDataType()); - } +// Create runtime program from sub_block desc according to block_idx and +// program_desc, which is used for while/conditional_block/subgraph op. 
+RuntimeProgram::RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx) + : exec_scope_(exec_scope) { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size) << "No block found!"; + CHECK(block_idx >= 0 && block_idx < block_size) + << "Invalid block index, expected [0," << (block_size - 1) << "] but got " + << block_idx; + auto block_desc = program_desc->GetBlock(block_idx); + instructions_.resize(kRootBlockIdx + 1); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + std::string op_type = op_desc->Type(); + // if (op_type == "feed" || op_type == "fetch") continue; + // Create op and pick up the best kernel + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; + if (op_type == "while") { + static_cast(op.get())->SetProgramDesc(program_desc); + } else if (op_type == "conditional_block") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } else if (op_type == "subgraph") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } + op->Attach(*op_desc, exec_scope_); + std::unique_ptr kernel; + if (op_desc->HasAttr(kKernelTypeAttr)) { + // Create op and pick up the best kernel according to the + // kKernelTypeAttr attribute + auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); + std::string alias; + Place place; + KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); + VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type + << " for " << op_type; + auto kernels = op->CreateKernels({place}); + CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; + auto it = std::find_if( + kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { + return it->alias() == alias; + }); + CHECK(it != kernels.end()); + kernel = std::move(*it); + } else { + // TODO(hong19860320) add kernel picking according to the type of input + // and output tensors + VLOG(3) << "The attr '" << kKernelTypeAttr + << "' not found, pick the first kernel for " << op_type; + std::vector> kernels; +#if defined(LITE_WITH_ARM) + kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); +#elif defined(LITE_WITH_X86) + kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); +#endif + if (kernels.size() > 0) { + kernel = std::move(kernels.front()); } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(in_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string in_arg_name; - const Type* type; - if (op->op_info()->GetInputArgname(in_name, &in_arg_name)) { - type = kernel->GetInputDeclType(in_arg_name); - } else { - op->op_info()->GetOutputArgname(in_name, &in_arg_name); - type = kernel->GetOutputDeclType(in_arg_name); - } - if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - if (in_name != "feed" && in_name != "fetch") { - v->SetShape(tensor->dims().data()); - switch (tensor->precision()) { -#define SET_DATATYPE(precision__, data_type) \ - case PrecisionType::precision__: \ - v->SetDataType(data_type); \ - LOG(INFO) << "update var" << (it->second).Name() << "done"; \ - break - SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); - SET_DATATYPE(kFloat, 
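[Editor's note] When the kKernelTypeAttr attribute is present, the new RuntimeProgram constructor above parses it and then selects, among the kernels created for the recorded place, the one whose alias matches. A small sketch of that selection step, using a placeholder FakeKernel type instead of the real KernelBase:

#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for a created kernel candidate.
struct FakeKernel {
  std::string alias;
};

// Pick the candidate whose alias matches the one parsed from kKernelTypeAttr,
// mirroring the std::find_if in the new RuntimeProgram constructor.
std::unique_ptr<FakeKernel> PickByAlias(std::vector<std::unique_ptr<FakeKernel>>& kernels,
                                        const std::string& alias) {
  auto it = std::find_if(kernels.begin(), kernels.end(),
                         [&](std::unique_ptr<FakeKernel>& k) { return k->alias == alias; });
  if (it == kernels.end()) return nullptr;  // the real code CHECKs instead
  return std::move(*it);
}

int main() {
  std::vector<std::unique_ptr<FakeKernel>> kernels;
  kernels.push_back(std::make_unique<FakeKernel>(FakeKernel{"def"}));
  kernels.push_back(std::make_unique<FakeKernel>(FakeKernel{"fp16"}));
  auto picked = PickByAlias(kernels, "fp16");
  std::cout << (picked ? picked->alias : "none") << "\n";  // prints fp16
}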
VarDescAPI::VarDataType::FP32); - SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); - SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); - SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); - SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); - SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); -#undef SET_DATATYPE - default: - VLOG(4) << "warning! unknown precision type"; - } - } - } else { - CHECK(false) << "unsupported var type"; - } + LOG(WARNING) << "No kernels found for " << op_type; } } +#ifdef LITE_WITH_OPENCL + if (kernel->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + kernel->SetContext(std::move(ctx)); + } else { + kernel->SetContext( + ContextScheduler::Global().NewContext(kernel->target())); + } +#else + kernel->SetContext(ContextScheduler::Global().NewContext(kernel->target())); +#endif + instructions_[kRootBlockIdx].emplace_back(std::move(op), std::move(kernel)); } + Init(); } + void RuntimeProgram::Run() { #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); @@ -160,7 +258,8 @@ void RuntimeProgram::Run() { } #endif int idx = -1; - for (auto& inst : instructions_) { + auto& insts = instructions_[kRootBlockIdx]; + for (auto& inst : insts) { ++idx; #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -193,57 +292,50 @@ void RuntimeProgram::Run() { #endif } -void Program::Build(const cpp::ProgramDesc& prog) { +void Program::Build(const std::shared_ptr& program_desc) { CHECK(ops_.empty()) << "Executor duplicate Build found"; // Create operators. - auto program = prog; - CHECK(program.BlocksSize()); - auto& main_block = *program.GetBlock(0); - for (size_t i = 0; i < main_block.OpsSize(); ++i) { - auto& op_desc = *main_block.GetOp(i); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - if (op_type == "while" || op_type == "conditional_block" || - op_type == "subgraph") { - auto sub_block_idx = op_desc.GetAttr("sub_block"); - CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize()) - << "Invalid attribute sub_block(" << sub_block_idx << ") for " - << op_type; - auto sub_block_desc = - const_cast(prog).GetBlock( - sub_block_idx); - CHECK(sub_block_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + ops_.resize(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto* op_desc = block_desc->GetOp(op_idx); + auto op_type = op_desc->Type(); + VLOG(4) << "create Op [" << op_type << "]"; + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; if (op_type == "while") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "conditional_block") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "subgraph") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } + op->Attach(*op_desc, exec_scope_); + ops_[block_idx].emplace_back(std::move(op)); } - 
ops_.emplace_back(std::move(op)); - ops_.back()->Attach(op_desc, exec_scope_); } } -void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, - const std::vector& var_names) { +void Program::PrepareWorkspace( + const std::shared_ptr& program_desc, + const std::vector& vars_to_clone) { CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; exec_scope_ = &scope_->NewScope(); // Create Feed and Fetch var. scope_->Var("feed")->GetMutable>(); scope_->Var("fetch")->GetMutable>(); - tmp_vars_.push_back("feed"); - tmp_vars_.push_back("fetch"); + vars_.push_back("feed"); + vars_.push_back("fetch"); - auto VarPrecision2KernlPrecision = + auto VarDescType2PrecisionType = [](const lite::VarDescAPI::Type& type) -> PrecisionType { switch (type) { case lite::VarDescAPI::Type::FP32: @@ -259,44 +351,60 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, case lite::VarDescAPI::Type::INT64: return PRECISION(kInt64); default: - // LOG(FATAL) << "not supported type: " << static_cast(type); + LOG(WARNING) << "Unable to convert var desc type(" + << static_cast(type) << ") to precision type!"; return PRECISION(kUnk); } }; - auto program = prog; - CHECK(program.BlocksSize()); - for (size_t b = 0; b < program.BlocksSize(); ++b) { - auto& main_block = *program.GetBlock(b); - for (size_t i = 0; i < main_block.VarsSize(); ++i) { - auto& var_desc = *main_block.GetVar(i); - if (!var_desc.Persistable()) { - if (var_desc.GetType() == lite::VarDescAPI::Type::LOD_TENSOR && - VarPrecision2KernlPrecision(var_desc.GetDataType()) != - PRECISION(kUnk)) { - var_data_type_[var_desc.Name()] = - VarPrecision2KernlPrecision(var_desc.GetDataType()); - } - tmp_vars_.push_back(var_desc.Name()); - VLOG(4) << "var name: " << var_desc.Name() << " type is " - << static_cast(var_desc.GetType()) << " data type is " - << static_cast(var_desc.GetDataType()); - exec_scope_->Var(var_desc.Name()); - if (b > 0) { - VLOG(4) << "var: " << var_desc.Name(); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto* var_desc = block_desc->GetVar(var_idx); + const auto& var_name = var_desc->Name(); + const auto& var_type = var_desc->GetType(); + if (!var_desc->Persistable()) { + vars_.push_back(var_name); + auto* var = exec_scope_->Var(var_name); + VLOG(4) << "Var " << var_name << " in block " << block_idx; + VLOG(4) << " - type " << static_cast(var_type); + if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) { + const auto& var_data_type = + VarDescType2PrecisionType(var_desc->GetDataType()); + if (var_data_type != PRECISION(kUnk)) { + var_type_map_[var_name] = LiteType::GetTensorTy( + TARGET(kUnk), var_data_type, DATALAYOUT(kUnk)); + } + VLOG(4) << " - data type " << static_cast(var_data_type); + // Create the tensor with the shape from var desc, it's convenient to + // the graph analysis in the passes, but you should resize the tensor + // with the real shape before accessing its data, because the + // var_shape may be [-1,3,224,224] + const auto& var_shape = var_desc->GetShape(); + auto* tensor = var->GetMutable(); + if (tensor->dims().empty() && !var_shape.empty()) { + tensor->Resize(var_shape); + VLOG(4) << " - dims " << tensor->dims().repr(); + } + } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) { + var_type_map_[var_name] = LiteType::GetTensorListTy( + TARGET(kUnk), 
PRECISION(kUnk), DATALAYOUT(kUnk)); } } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights_.push_back(var_desc.Name()); - if (var_desc.Persistable()) scope_->Var(var_desc.Name()); + if (var_name == "feed" || var_name == "fetch") continue; + weights_.push_back(var_name); + scope_->Var(var_name); } } } - for (auto i : var_names) { - exec_scope_->LocalVar(i); - auto* tensor = scope_->Var(i)->GetMutable(); - auto* sub_tensor = exec_scope_->Var(i)->GetMutable(); + for (auto var_name : vars_to_clone) { + exec_scope_->LocalVar(var_name); + auto* tensor = scope_->Var(var_name)->GetMutable(); + auto* sub_tensor = exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } } diff --git a/lite/core/program.h b/lite/core/program.h index 6fe65f158b8d547e7a741e329a192d2661a60060..f0715b9760b81f8de42e0acee5f5839fc42dd65a 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,7 +22,7 @@ #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/profiler.h" #endif @@ -41,58 +41,66 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; // - scope: which contains all the weights struct Program { public: - explicit Program(const std::shared_ptr& root) { scope_ = root; } - Program(const cpp::ProgramDesc& desc, - const std::shared_ptr& root, + explicit Program(const std::shared_ptr& root_scope) { + scope_ = root_scope; + } + Program(const std::shared_ptr& program_desc, + const std::shared_ptr& root_scope, const std::vector& valid_places, const std::vector& var_names = {}) - : scope_(root), valid_places_(valid_places), desc_(desc) { + : scope_(root_scope), valid_places_(valid_places) { CHECK(scope_) << "scope should be init first"; VLOG(4) << "prepare work"; - PrepareWorkspace(desc, var_names); + PrepareWorkspace(program_desc, var_names); VLOG(4) << "build desc"; - Build(desc); + Build(program_desc); VLOG(4) << "build desc finished"; } std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc_, scope_, valid_places_)); - return res; + return std::unique_ptr(new Program(scope_)); } const std::list& weights() const { return weights_; } - const std::list& tmp_vars() const { return tmp_vars_; } + const std::list& vars() const { return vars_; } std::list* mutable_weights() { return &weights_; } - std::list* mutable_tmp_vars() { return &tmp_vars_; } + std::list* mutable_vars() { return &vars_; } - const std::list>& ops() const { return ops_; } - std::list>* mutable_ops() { return &ops_; } + const std::list>& ops( + int block_idx = kRootBlockIdx) const { + return ops_[block_idx]; + } + std::list>* mutable_ops( + int block_idx = kRootBlockIdx) { + return &ops_[block_idx]; + } - lite::Scope* exec_scope() { return exec_scope_; } - lite::Scope* scope() { return scope_.get(); } + size_t block_size() { return ops_.size(); } - const std::map& var_data_type() const { - return var_data_type_; + Scope* exec_scope() { return exec_scope_; } + Scope* scope() { return scope_.get(); } + + const std::map& var_type_map() const { + return var_type_map_; } private: // Build from a program and scope. - void Build(const cpp::ProgramDesc& program); + void Build(const std::shared_ptr& program_desc); // Create temporary variables. 
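[Editor's note] The vars_to_clone loop at the end of PrepareWorkspace above creates a local variable in the execution scope and copies the data from the root-scope tensor, so the cloned variable can later diverge from the original. A toy illustration with stand-in MiniScope/MiniTensor types (the real code goes through LocalVar/Var/GetMutable, simplified away here):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Minimal stand-ins for Scope/Tensor, just enough to show the cloning step.
struct MiniTensor {
  std::vector<float> data;
  void CopyDataFrom(const MiniTensor& other) { data = other.data; }
};

struct MiniScope {
  std::map<std::string, MiniTensor> vars;
  MiniTensor* Var(const std::string& name) { return &vars[name]; }
};

int main() {
  MiniScope root, exec;  // exec plays the role of the execution scope
  root.Var("w")->data = {1.f, 2.f, 3.f};

  // Mirrors the vars_to_clone loop: create a local copy in the exec scope and
  // copy the data from the root-scope tensor.
  std::vector<std::string> vars_to_clone = {"w"};
  for (const auto& name : vars_to_clone) {
    exec.Var(name)->CopyDataFrom(*root.Var(name));
  }
  std::cout << exec.Var("w")->data.size() << "\n";  // 3
}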
- void PrepareWorkspace(const cpp::ProgramDesc& program, - const std::vector& var_names = {}); + void PrepareWorkspace(const std::shared_ptr& program_desc, + const std::vector& vars_to_clone = {}); private: - std::map var_data_type_; - std::list tmp_vars_; + std::map var_type_map_; + std::list vars_; std::list weights_; - std::list> ops_; + std::vector>> ops_; // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope_; + std::shared_ptr scope_; std::vector valid_places_; // Runtime scope. - lite::Scope* exec_scope_{}; - cpp::ProgramDesc desc_; + Scope* exec_scope_{}; }; struct Instruction { @@ -170,8 +178,22 @@ struct Instruction { */ class LITE_API RuntimeProgram { public: - explicit RuntimeProgram(std::vector&& insts) + explicit RuntimeProgram(std::vector>&& insts) : instructions_(std::move(insts)) { + Init(); + } + explicit RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx = kRootBlockIdx); + ~RuntimeProgram() { +#ifdef LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); +#endif // LITE_WITH_PROFILE + } + + void Init() { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } @@ -180,7 +202,7 @@ class LITE_API RuntimeProgram { #endif #ifdef LITE_WITH_NVTX const NVTXAnnotator& annotator = NVTXAnnotator::Global(); - for (auto& inst : instructions_) { + for (auto& inst : instructions_[kRootBlockIdx]) { NVTXRangeAnnotation annotation = annotator.AnnotateBlock(); register_layer_names_.push_back(annotator.RegisterString( const_cast(inst.op())->Type().c_str())); @@ -188,41 +210,38 @@ class LITE_API RuntimeProgram { register_layer_names_.push_back(annotator.RegisterString("one_loop")); #endif } - ~RuntimeProgram() { -#ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); -#endif // LITE_WITH_PROFILE - } void Run(); - void set_exec_scope(lite::Scope* x) { exec_scope_ = x; } - lite::Scope* exec_scope() { return exec_scope_; } + void set_exec_scope(Scope* x) { exec_scope_ = x; } + Scope* exec_scope() { return exec_scope_; } - size_t num_instructions() const { return instructions_.size(); } + const std::vector& instructions( + int block_idx = kRootBlockIdx) const { + return instructions_[block_idx]; + } - const std::vector& instructions() const { return instructions_; } + std::vector* mutable_instructions( + int block_idx = kRootBlockIdx) { + return &instructions_[block_idx]; + } - // `SaveOpInfosToProgram` will update the op list(ops_) of the block 0 - // in ProgramDesc. - void SaveOpInfosToProgram(cpp::ProgramDesc* desc); + size_t block_size() { return instructions_.size(); } - // `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in - // ProgramDesc. Namely, if a new var created in some passes, its var_desc will - // be added in vars_. 
- void UpdateVarsOfProgram(cpp::ProgramDesc* desc); + // Update the ops and vars of all of blocks to the given program_desc + // according to the instructions + void SaveToProgram(std::shared_ptr program_desc); private: RuntimeProgram(const RuntimeProgram&) = delete; - std::vector instructions_; - lite::Scope* exec_scope_{}; + std::vector> instructions_; + Scope* exec_scope_{}; #ifdef LITE_WITH_PROFILE profile::Profiler profiler_; void set_profiler() { - for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { - i->set_profiler(&profiler_); + for (auto& inst : instructions_[kRootBlockIdx]) { + inst.set_profiler(&profiler_); } } #endif diff --git a/lite/core/scope.h b/lite/core/scope.h index 57e4e3a5e058000f963ff369cbd25e69b9c981c6..41d6ee8f4f55268e3389cd4cada7e48fb8f922d7 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -62,19 +62,36 @@ class Scope final { // Create a Tensor variable. This will create a new Variable called `name`. Tensor* NewTensor(const std::string& name) { auto* var = Var(name); - return var->GetMutable(); + return var->GetMutable(); } const Tensor* FindTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return &var->Get(); + return &var->Get(); } Tensor* FindMutableTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return var->GetMutable(); + return var->GetMutable(); + } + + std::vector* NewTensorList(const std::string& name) { + auto* var = Var(name); + return var->GetMutable>(); + } + + const std::vector* FindTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return &var->Get>(); + } + + std::vector* FindMutableTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return var->GetMutable>(); } private: diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 197ee4ddbcd5df62dd0f8a15eba39e2a880f7125..3b21cf9147ded7b05938edc6c2985c8fce23842f 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { lod_ = other.lod_; memory_size_ = other.memory_size_; precision_ = other.precision_; + persistable_ = other.persistable_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 3d09c071aa7ecbe51f1723cad314f2aedcdb2bd7..2604f104e72081025d9bd59bb60843cc627ad54f 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -78,6 +78,28 @@ void RunModel(std::string model_dir, // 1. Set MobileConfig MobileConfig config; config.set_model_from_file(model_dir); + + // NOTE: Use android gpu with opencl, you should ensure: + // first, [compile **cpu+opencl** paddlelite + // lib](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md); + // second, [convert and use opencl nb + // model](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md). + // + /* Uncomment code below to enable OpenCL + bool is_opencl_backend_valid = ::IsOpenCLBackendValid(); + std::cout << "is_opencl_backend_valid:" << is_opencl_backend_valid << + std::endl; + if (is_opencl_backend_valid) { + // give opencl nb model dir + config.set_model_from_file(model_dir); + } else { + std::cout << "Unsupport opencl nb model." 
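[Editor's note] The new Scope helpers mirror the existing tensor accessors for vector-of-tensor variables: New* creates the variable, Find* returns nullptr when the name is absent. A toy scope with the same three-method shape (ToyScope/ToyTensor are stand-ins, not the real lite classes):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ToyTensor { std::vector<int64_t> dims; };

class ToyScope {
 public:
  std::vector<ToyTensor>* NewTensorList(const std::string& name) {
    return &lists_[name];  // creates the variable if it does not exist yet
  }
  const std::vector<ToyTensor>* FindTensorList(const std::string& name) const {
    auto it = lists_.find(name);
    return it == lists_.end() ? nullptr : &it->second;  // nullptr when absent
  }
  std::vector<ToyTensor>* FindMutableTensorList(const std::string& name) {
    auto it = lists_.find(name);
    return it == lists_.end() ? nullptr : &it->second;
  }

 private:
  std::map<std::string, std::vector<ToyTensor>> lists_;
};

int main() {
  ToyScope scope;
  auto* xs = scope.NewTensorList("xs");
  xs->resize(2);
  std::cout << scope.FindTensorList("xs")->size() << "\n";       // 2
  std::cout << (scope.FindTensorList("ys") == nullptr) << "\n";  // 1
}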
<< std::endl; + exit(1); + // you can give backup cpu nb model instead + // config.set_model_from_file(cpu_nb_model_dir); + } + */ + // NOTE: To load model transformed by model_optimize_tool before // release/v2.3.0, plese use `set_model_dir` API as listed below. // config.set_model_dir(model_dir); diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index 0dab71ed26c1b4ee438f52e088614bb577a9eade..3ad02a9c53c311a9253bbdf481c9aa6288685654 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -67,7 +67,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) { if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + LOG(FATAL) << "Not support " << type.name() << " as tensor type"; return static_cast(-1); } @@ -76,8 +76,8 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_cpp_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::type_index(typeid(void)); } @@ -86,8 +86,8 @@ std::string DataTypeToString(const framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_str_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::string(); } @@ -96,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str()); + LOG(FATAL) << "Not support " << DataTypeToString(type).c_str() + << " as tensor type"; return 0; } diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h index a8b11ec465e00356561c95b56f63e3c56cbe8a5b..9896c0d54844b99748e1a7c8bddc5e178f84fb51 100644 --- a/lite/fluid/data_type.h +++ b/lite/fluid/data_type.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/framework.pb.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -72,7 +72,7 @@ inline void VisitDataType(framework::proto::VarType::Type type, _ForEachDataType_(VisitDataTypeCallback); #undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); + LOG(FATAL) << "Not supported " << type; } extern std::string DataTypeToString(const framework::proto::VarType::Type type); diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h index c3af7e9f6c3588f404c614430bf01f7ab5e099e5..3312c9c39eaad4fc0a4225d9734b3f80790b2979 100644 --- a/lite/fluid/eigen.h +++ b/lite/fluid/eigen.h @@ -17,7 +17,7 @@ limitations under the License. 
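[Editor's note] The PADDLE_THROW calls in lite/fluid/data_type.cc are replaced with LOG(FATAL) stream messages, but the lookup itself is unchanged: a table keyed by std::type_index with an error path on a miss. A toy version of ToDataType with an illustrative, deliberately incomplete table (the enum and its contents are not the real proto types):

#include <iostream>
#include <map>
#include <string>
#include <typeindex>
#include <typeinfo>

enum class ProtoType { FP32, INT32, INT64, UNKNOWN };

ProtoType ToDataType(std::type_index type) {
  static const std::map<std::type_index, ProtoType> table = {
      {typeid(float), ProtoType::FP32},
      {typeid(int), ProtoType::INT32},
      {typeid(long long), ProtoType::INT64},
  };
  auto it = table.find(type);
  if (it != table.end()) return it->second;
  // The real code now reports this through LOG(FATAL) instead of PADDLE_THROW.
  std::cerr << "Not support " << type.name() << " as tensor type\n";
  return ProtoType::UNKNOWN;
}

int main() {
  std::cout << (ToDataType(typeid(float)) == ProtoType::FP32) << "\n";  // 1
  ToDataType(typeid(double));  // prints the unsupported-type message
}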
*/ #include #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -30,7 +30,7 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const lite::DDim& dims) { - PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size"); + CHECK_EQ(dims.size(), D) << "D must match DDim::size"; Type ret; for (size_t d = 0; d < dims.size(); d++) { ret[d] = dims[d]; @@ -39,7 +39,7 @@ struct EigenDim { } static Type From(const DDim::value_type length) { - PADDLE_ENFORCE_EQ(D, 1, "D must be 1."); + CHECK_EQ(D, 1) << "D must be 1."; Type ret; ret[0] = length; return ret; @@ -84,16 +84,16 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } }; diff --git a/lite/fluid/rw_lock.h b/lite/fluid/rw_lock.h index eb9829425eca9d8bd363a45961302a7f3818e513..f68a21502073ccde6d27c46793d3f8cfa0751af3 100644 --- a/lite/fluid/rw_lock.h +++ b/lite/fluid/rw_lock.h @@ -20,7 +20,7 @@ limitations under the License. 
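[Editor's note] EigenMatrix::Reshape now CHECKs that num_col_dims lies strictly between 0 and the tensor rank before calling Flatten2D. Assuming the usual Paddle convention that Flatten2D folds the leading num_col_dims dimensions into rows and the remaining ones into columns, a small sketch of the precondition and the resulting 2-D shape:

#include <cassert>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Fold dims [0, num_col_dims) into the row extent and the rest into the
// column extent; num_col_dims must lie strictly between 0 and rank, which is
// what the new CHECK in Reshape enforces.
std::pair<int64_t, int64_t> Flatten2D(const std::vector<int64_t>& dims, int num_col_dims) {
  int rank = static_cast<int>(dims.size());
  assert(num_col_dims > 0 && num_col_dims < rank);
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  auto rc = Flatten2D({2, 3, 4, 5}, 2);
  std::cout << rc.first << " x " << rc.second << "\n";  // 6 x 20
}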
*/ #include // NOLINT #endif // !_WIN32 -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -33,17 +33,15 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); + CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed"; } inline void WRLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); + CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed"; } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed"; } private: diff --git a/lite/fluid/selected_rows.cc b/lite/fluid/selected_rows.cc index 98e9325ca2f8fab3f8aa77a0bb074ae5d1be7670..361d63cf5dfd9cd21db47917047a7e2f3758ec96 100644 --- a/lite/fluid/selected_rows.cc +++ b/lite/fluid/selected_rows.cc @@ -119,7 +119,7 @@ void DeserializeFromStream( // the 1st field, unit32_t version for SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; } { // the 2st field, rows information @@ -163,24 +163,22 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, if (iter == id_to_index_.end()) { rwlock_->UNLock(); if (!auto_grown) { - PADDLE_THROW("key %ld not found", key); + LOG(FATAL) << "key " << key << " not found"; } rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %lu should have the same size with rows_ %lu", - map_size, - vector_size); + LOG(FATAL) << "id_to_index_ size " << map_size + << " should have the same size with rows_ " << vector_size; } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + LOG(FATAL) << "selected rows is full, then length exceed " << row_num; } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -213,16 +211,14 @@ void SelectedRows::Get(const lite::Tensor& ids, lite::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + CHECK(value->IsInitialized()) << "The value tensor should be initialized."; if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, - value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + CHECK_EQ(value_width, value->numel() / value->dims()[0]) + << "output tensor should have the same shape with table " + "except the dims[0]."; for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/lite/fluid/selected_rows.h b/lite/fluid/selected_rows.h index 5db322f8592f4518d9e1ccc996ffb1e847e7b964..aad93552ebef5d67c77e554b29bf593f5cd176f7 100644 --- a/lite/fluid/selected_rows.h +++ b/lite/fluid/selected_rows.h @@ -82,7 +82,7 @@ class SelectedRows { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == 
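[Editor's note] SelectedRows::AutoGrownIndex keeps its locking structure and only changes the error reporting to LOG(FATAL). The pattern itself is worth spelling out: look up under the read lock, and on a miss drop it, take the write lock, and re-check before growing the table, since another writer may have won the race. A sketch of that pattern, with std::shared_mutex standing in for the pthread-based RWLock:

#include <cstdint>
#include <iostream>
#include <map>
#include <shared_mutex>
#include <vector>

class AutoGrownTable {
 public:
  int64_t IndexOf(int64_t key) {
    {
      std::shared_lock<std::shared_mutex> rd(mutex_);  // read lock for the fast path
      auto it = id_to_index_.find(key);
      if (it != id_to_index_.end()) return it->second;
    }
    std::unique_lock<std::shared_mutex> wr(mutex_);  // write lock to grow the table
    auto it = id_to_index_.find(key);                // re-check under the write lock
    if (it != id_to_index_.end()) return it->second;
    rows_.push_back(key);
    int64_t index = static_cast<int64_t>(rows_.size()) - 1;
    id_to_index_[key] = index;
    return index;
  }

 private:
  std::shared_mutex mutex_;
  std::map<int64_t, int64_t> id_to_index_;
  std::vector<int64_t> rows_;
};

int main() {
  AutoGrownTable table;
  int64_t a = table.IndexOf(42);
  int64_t b = table.IndexOf(7);
  int64_t c = table.IndexOf(42);
  std::cout << a << " " << b << " " << c << "\n";  // 0 1 0
}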
rows_.end()) { - PADDLE_THROW("id %ld not in table", key); + LOG(FATAL) << "id " << key << " not in table"; } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 2416278ad74068d28f6de523c55513891b08cc72..5dffd7c1a93225a38e433a4ff447b9b0fc863216 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b..a1e69b624a600719121926fc3a4f58391fa63ce6 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -59,7 +59,7 @@ void Module::AddHeaderIncludeGenCode() { Line("#include \"lite/gen_code/paddle_infer.h\""); Line("#include \"lite/core/op_registry.h\""); Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); + Line("#include \"lite/model_parser/cpp_desc.h\""); Line(""); Line(""); } diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index d316eac43f99664fa71cba54b3ab5360852300a0..e100904a7fe4f9c3e489c056ceeeba21657b4944 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -20,9 +20,9 @@ #include "lite/core/program.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/pb/op_desc.h" #include "lite/utils/all.h" diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc index d0b1c1f8b23f90976f4b315a1a4e13069b2136f1..5b3db0de8342f312dcb4443ebcd1fd72e857eea0 100644 --- a/lite/gen_code/gen_code_test.cc +++ b/lite/gen_code/gen_code_test.cc @@ -25,7 +25,7 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" #include "lite/model_parser/pb/program_desc.h" diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 17a836b17183d69b0e2a15b46b7a2097c323312f..91268bc28dbdf38137904f986b254a76cbd5e538 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(mlu) add_subdirectory(apu) add_subdirectory(bm) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index ca6e0ff2ac3930fe5cab9230dbbefa0af0a864ab..bf5e313180d9d8089b29f993384bd243b2a5ed05 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int neuron_errCode; VLOG(3) << "[APU] Converting [" << op_type << "]"; + CHECK(op_info->HasAttr("enable_int8") && + 
op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); @@ -94,30 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - float input_scale; - float output_scale; - std::vector weight_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - weight_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - output_scale = op_info->GetAttr("output_scale"); - VLOG(3) << "has output scale:" << output_scale; - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims - << " ,weight_scale size: " << weight_scale.size(); + << " ,filter_scale size: " << filter_scale.size(); VLOG(3) << "filter_dims: " << filter_dims << " ,memory_size: " << filter->memory_size() << " ,data_size: " << filter->data_size(); @@ -216,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronOperandType filterType; NeuronOperandType channelFilterType; NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { // Per layer type filterType.type = NEURON_TENSOR_QUANT8_ASYMM; - filterType.scale = weight_scale[0]; + filterType.scale = filter_scale[0]; filterType.zeroPoint = 128; filterType.dimensionCount = filter_dims.size(); filterType.dimensions = &dims_filter[0]; @@ -237,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { symmPerChannelQuantParams.channelDim = 3; else symmPerChannelQuantParams.channelDim = 0; - symmPerChannelQuantParams.scaleCount = weight_scale.size(); - symmPerChannelQuantParams.scales = weight_scale.data(); + symmPerChannelQuantParams.scaleCount = filter_scale.size(); + symmPerChannelQuantParams.scales = filter_scale.data(); biasType.scale = 0; } std::shared_ptr filter_node = nullptr; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { NeuronModel_addOperand(model, &filterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); - VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" - << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]" + << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" << filterType.dimensions[3]; memcpy(filter->mutable_data(), @@ -263,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_addOperand(model, &channelFilterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "chennel filter node idx: " << filter_node->index() - << " 
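[Editor's note] In the APU conv bridge the quantization scales now come from op_info->GetInputScale/GetOutputScale rather than raw attributes, and the filter operand is described as per-layer or per-channel depending on how many filter scales there are (with the channel dimension set to 3 or 0 depending on the filter layout). A sketch of that branch with illustrative struct names, not the real NeuronAdapter types:

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-ins for the Neuron operand descriptors.
struct PerLayerQuant   { float scale; int32_t zero_point; };
struct PerChannelQuant { std::vector<float> scales; uint32_t channel_dim; };

// Mirrors the branch in the conv bridge: one scale -> per-layer asymmetric
// quantization, many scales -> symmetric per-channel quantization.
void DescribeFilterQuant(const std::vector<float>& filter_scale, uint32_t channel_dim) {
  if (filter_scale.size() == 1) {
    PerLayerQuant q{filter_scale[0], 128};
    std::cout << "per-layer: scale=" << q.scale << " zp=" << q.zero_point << "\n";
  } else {
    PerChannelQuant q{filter_scale, channel_dim};
    std::cout << "per-channel: " << q.scales.size() << " scales on dim "
              << q.channel_dim << "\n";
  }
}

int main() {
  DescribeFilterQuant({0.02f}, 0);                // per-layer
  DescribeFilterQuant({0.02f, 0.03f, 0.05f}, 3);  // per-channel on dim 3
}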
,scale_count:" << weight_scale.size() - << " weight_scale[0]:" << weight_scale.data()[0] + << " ,scale_count:" << filter_scale.size() + << " filter_scale[0]:" << filter_scale.data()[0] << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" << channelFilterType.dimensions[1] << ":" << channelFilterType.dimensions[2] << ":" @@ -298,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); @@ -364,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - if (graph->IsOutput(output_name)) - outType.scale = output_scale / 127; - else - outType.scale = output_scale; + outType.scale = output_scale; outType.zeroPoint = 128; outType.dimensionCount = output_dims.size(); std::vector dims_out = {(uint32_t)output_dims[0], @@ -401,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int32_t* int32_bias_data = reinterpret_cast(bias->mutable_data()); float2int32( - bias->data(), input_scale, weight_scale, int32_bias_data); + bias->data(), input_scale, filter_scale, int32_bias_data); VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index a00a35f9a0766b4fb4f02d05419a0ae42354ca37..106ce2c16f3fd287a27c92179fa3a429c7be57c8 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); @@ -52,23 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " out_dims: " << out_dims << " m: " << m << " k: " << k << " n: " << n; - float input_scale = 1.0f; - float out_scale = 1.0f; - std::vector w_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - w_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(w_name)); + auto w_scale = op_info->GetInputScale(w_name); + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Add input tensor type NeuronOperandType inType; diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc index 2bda76ab99af727276102e884f84534b77a59586..b82f23beaf715e8c720ffc22792b804ff6c2c225 100644 --- a/lite/kernels/apu/bridges/pool_op.cc +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { 
auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "] "; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -87,22 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ksize); // Add x tensor type - float x_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - x_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; NeuronOperandType xType; xType.type = NEURON_TENSOR_QUANT8_ASYMM; diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc index 6a289ac987b9fa300cb548d190b6e46b67f24c44..dec6d12307b50798d04f743064360aa6870acfa3 100644 --- a/lite/kernels/apu/bridges/softmax_op.cc +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -45,22 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis += x_rank; } - float input_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Check output scale NeuronOperandType xType; @@ -104,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add out operand NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - outType.scale = out_scale / 127; + outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = x_dims.size(); outType.dimensions = &dims_x[0]; NeuronModel_addOperand(model, &outType); // 3: output std::shared_ptr out_node = nullptr; out_node = graph->Add(out_name, dims_x); - VLOG(3) << "output_scale: " << out_scale; + VLOG(3) << "out_scale: " << out_scale; float beta_val[] = {1.0f}; NeuronModel_setOperandValue( diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc index 6009e71e05c33f6dedfd995020612e112c888d36..579ed97b161dade9822250dab411cefd214b50f8 100644 --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -28,7 +28,7 @@ namespace lite { namespace kernels { namespace apu { -int 
SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -37,8 +37,8 @@ int SubgraphEngine::BuildDeviceProgram() { subgraph::apu::Graph graph; int neuron_errCode = NeuronModel_create(&model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create model"; - return subgraph::FAILED; + LOG(WARNING) << "[APU] Failed to create the neuron model!"; + return false; } graph.set_model(model_); graph.set_input_names(input_names_); @@ -46,15 +46,19 @@ int SubgraphEngine::BuildDeviceProgram() { // Convert all of ops and their input vars and weights and added into the APU // NIR graph + if (!origin_program_) { + BuildOriginProgram(); + } const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kAPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); @@ -63,60 +67,43 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Get input tensor - std::vector ins; - origin_itensors_.resize(input_names_.size()); - origin_idims_.resize(input_names_.size()); + // Get the index of input tensors + std::vector input_indices; for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" - << origin_idims_[i].production(); - // Get input index - int idx; - if (graph.Has(input_names_[i])) { - ins.push_back(graph.Get(input_names_[i])->index()); - VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find input: " << input_names_[i]; - return subgraph::FAILED; - } + CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node " + << input_names_[i]; + auto index = graph.Get(input_names_[i])->index(); + input_indices.push_back(index); + VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims " + << origin_itensors_[i]->dims() << " index " << index; } - // Get output tensor - std::vector outs; - origin_otensors_.resize(output_names_.size()); - origin_odims_.resize(output_names_.size()); + // Get the index of output tensors + std::vector output_indices; for (int i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" - << origin_odims_[i].production(); + CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node " + << output_names_[i]; origin_otensors_[i]->mutable_data(); - // Get input index - if (graph.Has(output_names_[i])) { - outs.push_back(graph.Get(output_names_[i])->index()); - VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find output: " << output_names_[i]; - return subgraph::FAILED; - } + auto index = graph.Get(output_names_[i])->index(); + output_indices.push_back(index); + 
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims " + << origin_otensors_[i]->dims() << " index " << index; } - VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); - // Set subgraph input/output - NeuronModel_identifyInputsAndOutputs( - model_, ins.size(), &ins[0], outs.size(), &outs[0]); + // Indentify the input and output tensors of the neuron model + NeuronModel_identifyInputsAndOutputs(model_, + input_indices.size(), + &input_indices[0], + output_indices.size(), + &output_indices[0]); neuron_errCode = NeuronModel_finish(model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; - return subgraph::FAILED; + LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode; + return false; } VLOG(3) << "[APU] APU NIR model created!"; @@ -129,15 +116,14 @@ int SubgraphEngine::BuildDeviceProgram() { compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; - - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -149,22 +135,19 @@ int SubgraphEngine::LaunchDeviceProgram() { int neuron_errCode = NeuronExecution_create(compilation_, &run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "[APU] Build APU runtime failed!"; - return subgraph::FAILED; + return false; } // Set input buffer - Tensor input_temp; for (size_t i = 0; i < origin_itensors_.size(); i++) { - input_temp.Resize({origin_idims_[i]}); - uint8_t* input_data = input_temp.mutable_data(); - memcpy(input_data, - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + auto origin_data = origin_itensors_[i]->mutable_data(); + auto converted_data = reinterpret_cast(origin_data); for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { - input_data[j] += (uint8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) + 128); } NeuronExecution_setInput( - run, i, NULL, input_data, origin_itensors_[i]->memory_size()); + run, i, NULL, converted_data, origin_itensors_[i]->memory_size()); } // Set output buffer @@ -180,19 +163,20 @@ int SubgraphEngine::LaunchDeviceProgram() { neuron_errCode = NeuronExecution_compute(run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; - return subgraph::FAILED; + return false; } for (size_t i = 0; i < origin_otensors_.size(); i++) { - int8_t* output_data = origin_otensors_[i]->mutable_data(); - VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + auto converted_data = origin_otensors_[i]->mutable_data(); + auto origin_data = reinterpret_cast(converted_data); for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { - output_data[j] -= (int8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) - 128); } } NeuronExecution_free(run); VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; - return 0; + return true; } SubgraphEngine::~SubgraphEngine() { @@ -207,18 +191,17 @@ SubgraphEngine::~SubgraphEngine() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace apu diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h index ecd8a38343cd1f62bb5a3bf8e948384b90cfe826..de15abdf7fdbce8001676a2bf7f651ad1e435c74 100644 --- a/lite/kernels/apu/subgraph_compute.h +++ b/lite/kernels/apu/subgraph_compute.h @@ -31,18 +31,22 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} ~SubgraphEngine(); protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; NeuronModel *model_; NeuronCompilation *compilation_; diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 687d91202fd62ed09a5157abe90bb59eb56303b5..f4fe6ba1ebb9a7e775f0d5db1031f9fd40508c20 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -54,7 +54,7 @@ add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_k add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) - +add_kernel(group_norm_compute ARM extra SRCS group_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) ## 3. 
extra kernels add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -75,9 +75,9 @@ add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_comp add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm) # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -86,7 +86,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -102,7 +101,6 @@ add_kernel(deformable_conv_compute_arm ARM extra SRCS deformable_conv_compute.cc add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index 085e914c6e05c26d3031a4cfdac3c39d31f40f6d..5f3174edbbb53381db29bfa6b99f62a9e7094a4d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -217,6 +217,17 @@ void AbsCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void ThresholdedReluCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.relu_threshold; + lite::arm::math::act_thresholded_relu( + x_data, output_data, x_dims.production(), 
threshold, ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -336,3 +347,12 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL(thresholded_relu, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ThresholdedReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 2e9774637b7a9156197ffeff5f4bca13a20620bb..a915937590ee8748ac419c5b33f82c81d8480852 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -175,6 +175,16 @@ class AbsCompute : public KernelLite { virtual ~AbsCompute() = default; }; +class ThresholdedReluCompute + : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ThresholdedReluCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc deleted file mode 100644 index 137668fa5e0d1bd07e838b3040a31e084a7475c8..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/activation_grad_compute.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/activation_grad_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void SquareGradCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto out_grad_dims = param.Out_grad->dims(); - auto out_grad_data = param.Out_grad->data(); - - auto x_data = param.X->data(); - auto x_grad_data = param.X_grad->mutable_data(); - lite::arm::math::act_square_grad(x_data, - out_grad_data, - x_grad_data, - out_grad_dims.production(), - ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(square_grad, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SquareGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 034d57cdaba77130b319d203c3ae0616720c9d31..5e511264a855ac86a9fb12ede56d51fb1ea83010 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
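// Illustrative reference, not part of the patch: the thresholded_relu kernel
// registered above forwards param.relu_threshold to
// lite::arm::math::act_thresholded_relu. Assuming the conventional definition
// of the activation (pass values above the threshold, zero everything else),
// a scalar reference would look like the hypothetical sketch below.
#include <cstdint>

static void thresholded_relu_ref(const float* x,
                                 float* out,
                                 int64_t size,
                                 float threshold) {
  for (int64_t i = 0; i < size; ++i) {
    // keep x[i] only when it exceeds the threshold, otherwise output 0
    out[i] = x[i] > threshold ? x[i] : 0.0f;
  }
}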
-#include "lite/kernels/arm/argmax_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/argmax_compute.h" namespace paddle { namespace lite { @@ -66,9 +68,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { } TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); + auto argmax = KernelRegistry::Global().Create("arg_max"); ASSERT_FALSE(argmax.empty()); ASSERT_TRUE(argmax.front()); } diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc index af145435ebe2c5bd0c1d1b78b112e8a8572d36ec..7348630e776155cd421bc78a9da7494d42e84c3f 100644 --- a/lite/kernels/arm/axpy_compute_test.cc +++ b/lite/kernels/arm/axpy_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/axpy_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/axpy_compute.h" namespace paddle { namespace lite { @@ -61,8 +63,7 @@ void axpy_compute_ref(const operators::AxpyParam& param) { } TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); + auto axpy = KernelRegistry::Global().Create("axpy"); ASSERT_FALSE(axpy.empty()); ASSERT_TRUE(axpy.front()); } diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc index bf690f88a5e776709a3988cc843762db3bf684e6..a3ef9bda4a17ebfdb5468c911cc6c9aa6a5d4fd7 100644 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ b/lite/kernels/arm/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/batch_norm_compute.h" namespace paddle { namespace lite { @@ -78,9 +80,7 @@ void batch_norm_compute_ref(const operators::BatchNormParam& param) { } TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 6dac97dcbc59991d4680ab1a98a54a900573f631..383e868843b43f4081e1eac330b1422b79307d9c 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8::Run() { din, dout, scale.data(), 1, 1, param.input->numel()); } +template +void CalibComputeInt64ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + std::vector scale = {param.scale}; + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = din[i]; + } +} + template void CalibComputeInt8ToFp32::Run() { auto& param = this->template Param(); @@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL( calib_once, kARM, @@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index a4c8b4c1232101416e95171d70ab629f6a37177b..f10bb931df9b276bc3bb01da16906f3e5b5a7dce 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8 private: }; +template +class CalibComputeInt64ToInt32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToInt32() override{}; + + private: +}; + template class CalibComputeInt8ToFp32 : public KernelLite { diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a..919e9c603edff4383f086ac795c3dff4ed856c4f 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,8 +62,19 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 0 && param.out_dtype == 5) { // bool->fp32 + const bool* x_data_begin = param.X->data(); + const bool* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); + } else if 
(param.in_dtype == 3 && param.out_dtype == 5) { // int64->fp32 + const int64_t* x_data_begin = param.X->data(); + const int64_t* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); } else { - LOG(FATAL) << "other has not been implemented"; + LOG(FATAL) << "other has not been implemented transform with dtype" + << param.in_dtype << " X, dtype" << param.out_dtype << " Out"; } } diff --git a/lite/kernels/arm/clip_compute.cc b/lite/kernels/arm/clip_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d71eaef9e5b3e68d571a48e1a9772b8870c29b7 --- /dev/null +++ b/lite/kernels/arm/clip_compute.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/clip_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ClipCompute::Run() { + auto& param = Param(); + lite::Tensor* x = param.x; + lite::Tensor* min_tensor = param.min_tensor; + lite::Tensor* max_tensor = param.max_tensor; + lite::Tensor* out = param.out; + float min = param.min; + float max = param.max; + + if (min_tensor != nullptr) { + min = min_tensor->data()[0]; + } + if (max_tensor != nullptr) { + max = max_tensor->data()[0]; + } + + const float* x_ptr = x->data(); + float* out_ptr = out->mutable_data(); + int64_t num = x->numel(); + lite::arm::math::clip_kernel_fp32(x_ptr, num, min, max, out_ptr); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + clip, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ClipCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Min", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Max", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/clip_compute.h b/lite/kernels/arm/clip_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..94c2b3a32ea2fc0847d8e223ecd61856fa8e3ed2 --- /dev/null +++ b/lite/kernels/arm/clip_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/clip_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ClipCompute : public KernelLite { + public: + using param_t = operators::ClipParam; + + void Run() override; + + virtual ~ClipCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955c29b261b2103479ea00bb836c0a31f..9ab4ca54bb909876bc823ac25cb67764eab12e47 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -52,11 +52,7 @@ void ConcatFunc(const std::vector inputs, output_offset += in_stride[0]; } } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - lite::arm::math::concat_func(inputs_concat, axis, out); + lite::arm::math::concat_func(inputs, axis, out); } } @@ -71,6 +67,9 @@ void ConcatCompute::Run() { auto* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } + if (axis < 0) { + axis += inputs[0]->dims().size(); + } switch (inputs.front()->precision()) { case PRECISION(kFloat): diff --git a/lite/kernels/arm/concat_compute_test.cc b/lite/kernels/arm/concat_compute_test.cc index 44c6dedd44ad4509a3f5a9c13fc04d6f1ffbdc64..862094fd23aa339bba0b06c4200e71f06402c645 100644 --- a/lite/kernels/arm/concat_compute_test.cc +++ b/lite/kernels/arm/concat_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/concat_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/concat_compute.h" namespace paddle { namespace lite { @@ -221,8 +223,7 @@ TEST(concat_arm, compute_input_multi) { } TEST(concat, retrive_op) { - auto concat = - KernelRegistry::Global().Create("concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h deleted file mode 100644 index 91eadff931ec8aa54092347bcf18f8428130ef75..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/conditional_block_compute.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/operators/conditional_block_op.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#include "lite/core/profile/precision_profiler.h" -#include "lite/core/profile/profiler.h" -#endif - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class CondExecutor { - typedef std::shared_ptr OpPtr; - - public: - CondExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - lite::profile::Profiler profiler; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - for (auto &op_handler : ops_of_block_) { - op_handler->CheckShape(); - op_handler->InferShape(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - std::unique_ptr kernel(op_handler->GetKernel()); - Instruction inst(op_handler, std::move(kernel)); - inst.set_profiler(&profiler); -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - op_handler->Run(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class ConditionalBlockCompute - : public KernelLite { - public: - using param_t = operators::ConditionalBlockParam; - - void PrepareForRun() override; - void Run() override; - - virtual ~ConditionalBlockCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 2a545e70691f030a3a1e3f2a9a9822f5cd8b85b9..54e67de5abbfc88f64a50b07335d2527d9738206 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -73,7 +73,6 @@ void ConvCompute::PrepareForRun() { // VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal && no_dilation) { - // TODO(MyPandaShaoxiang): winograd conv support any pad impl_ = new WinogradConv; // VLOG(3) << "invoking winograd conv"; } else if (param.groups == 1 && kw == 3 && stride == 2 && @@ -122,10 +121,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; - } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - ic * oc < 4 * hin * win && kps_equal && no_dilation) { + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && + pads_equal) { + impl_ = new WinogradConv; + // VLOG(3) << 
"Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; @@ -169,10 +172,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; - } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && - ic * oc < 4 * hin * win && kps_equal && no_dilation) { + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { impl_ = new DirectConv; // VLOG(3) << "Run DirectConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && + pads_equal) { + impl_ = new WinogradConv; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index af428fd9c03a34f2d181958815a927da62982e9d..f61c6109cdfd57b30c2b57390d21dec7c3bb3aa2 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "lite/kernels/arm/conv_winograd.h" -#include #include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/packed_sgemm.h" @@ -183,6 +182,189 @@ void WinogradConv::Run() { } } +template +void WinogradConv::ReInitWhenNeeded() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + int threads = ctx.threads(); + + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + if (last_shape_ == x_dims) { + return; + } + last_shape_ = x_dims; + //! update workspace size + int ic = x_dims[1]; + int ih = x_dims[2]; + int iw = x_dims[3]; + int oc = o_dims[1]; + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; + auto pad = *(param.paddings); + int pad_h0 = pad[0]; + int pad_h1 = pad[1]; + int pad_w0 = pad[2]; + int pad_w1 = pad[3]; + int oc_pad = (oc + 7) / 8 * 8; + int ic_pad = (ic + 7) / 8 * 8; + const int new_input_size = + ic_pad * (ih + pad_h0 + pad_h1) * (iw + pad_w0 + pad_w1) + + oc_pad * oh * ow * sizeof(int32_t); + int tmp_input_thread_size_byte = + tile_block * ic_pad * wino_iw * wino_iw * sizeof(int16_t); + int tmp_output_thread_size_byte = + tile_block * oc_pad * wino_iw * wino_iw * sizeof(int32_t); + const int temp_size = + (tmp_input_thread_size_byte + tmp_output_thread_size_byte + + wino_iw * wino_iw * (8 + 8 * sizeof(int32_t))) * + threads; + workspace_size_ = temp_size + new_input_size; + + //! update trans weights impl + // choose_small_ = ow * oh / (tile_block * threads) < 36 ? 
true : false; + // we only support 2x2 now + choose_small_ = true; + float w_fact = 0.25; + if (choose_small_) { + wino_iw = 4; + + if (last_function_ == 0) { + return; + } + last_function_ = 0; + } else { + wino_iw = 6; + if (last_function_ == 1) { + return; + } + last_function_ = 1; + } + /// update scale + for (auto& ws : w_scale_) { + ws *= w_fact; + } + + weights_.Resize({1, 1, 1, wino_iw * wino_iw * oc_pad * ic_pad}); + void* trans_tmp_ptr = malloc(sizeof(int16_t) * wino_iw * wino_iw * oc * ic); + auto weights_data_ = weights_.mutable_data(); + if (!choose_small_) { + } else { + lite::arm::math::weight_trans_c8_4x4_int8( + weights_data_, + param.filter->template data(), + ic, + oc, + trans_tmp_ptr); + } + free(trans_tmp_ptr); +} + +template +void WinogradConv::PrepareForRun() { + auto& param = this->Param(); + w_scale_ = param.weight_scale; + if (w_scale_.size() != 1 && w_scale_.size() != param.filter->dims()[0]) { + LOG(FATAL) << "weights scale size must equal to filter size"; + return; + } + if (w_scale_.size() == 1) { + for (int i = 0; i < param.filter->dims()[0] - 1; ++i) { + w_scale_.push_back(w_scale_[0]); + } + } + float input_scale = param.input_scale; + for (auto& ws : w_scale_) { + ws *= input_scale; + } + if (param.bias) { + bias_.Resize(param.bias->dims()); + auto ptr = bias_.mutable_data(); + auto ptr_in = param.bias->template data(); + for (int i = 0; i < bias_.numel(); ++i) { + ptr[i] = ptr_in[i]; + } + } + if (OutType == PRECISION(kInt8)) { + float output_scale = param.output_scale; + for (auto& ws : w_scale_) { + ws /= output_scale; + } + if (param.bias) { + auto ptr = bias_.mutable_data(); + for (int i = 0; i < bias_.numel(); ++i) { + ptr[i] /= output_scale; + } + } + } + ReInitWhenNeeded(); +} + +template +void WinogradConv::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + ctx.ExtendWorkspace(workspace_size_); + const auto* i_data = param.x->template data(); + const auto* w_data = weights_.data(); + const auto* b_data = param.bias ? 
bias_.data() : nullptr; + // const float* i_data; + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + int iw = x_dims[3]; // nchw + int ih = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int oh = o_dims[2]; + int ow = o_dims[3]; + int oc = o_dims[1]; + + // now always choose small + if (OutType == PRECISION(kInt8)) { + auto* o_data = param.output->template mutable_data(); + lite::arm::math::conv_compute_2x2_3x3_int8(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + w_scale_.data(), + param, + &ctx); + } else { + auto* o_data = param.output->template mutable_data(); + lite::arm::math::conv_compute_2x2_3x3_int8(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + w_scale_.data(), + param, + &ctx); + } +#ifdef LITE_WITH_PROFILE + kernel_func_name_ = "conv_compute_2x2_3x3_int8"; +#endif +} +template class WinogradConv; +template class WinogradConv; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 1cb4d69acbc562b7bb7d50944daf6c6ff3b5d790..b93a719f7dbb13aa9888ea943fa81b6ea2b38c00 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -16,11 +16,11 @@ #include #include +#include #include "lite/backends/arm/math/conv_impl.h" #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/target_wrapper.h" - namespace paddle { namespace lite { namespace kernels { @@ -52,7 +52,34 @@ class WinogradConv : public KernelLite { bool choose_small_{false}; int wino_iw{8}; }; +template +class WinogradConv + : public KernelLite { + public: + WinogradConv() = default; + ~WinogradConv() {} + virtual void PrepareForRun(); + virtual void ReInitWhenNeeded(); + virtual void Run(); +#ifdef LITE_WITH_PROFILE + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + std::string kernel_func_name_{"NotImplForConvWino"}; +#endif + protected: + using param_t = operators::ConvParam; + Tensor weights_; + Tensor bias_; + DDim last_shape_; + int workspace_size_{0}; + int last_function_{-1}; + bool choose_small_{true}; + int wino_iw{4}; + std::vector w_scale_; +}; } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/decode_bboxes_compute_test.cc b/lite/kernels/arm/decode_bboxes_compute_test.cc index 271a99c29b61063877b7d1c0d2e50bc65d135d72..ef9da0f1e2c53a021c82f19d3151a2fe8fba8af4 100644 --- a/lite/kernels/arm/decode_bboxes_compute_test.cc +++ b/lite/kernels/arm/decode_bboxes_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
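// Illustrative sketch, not part of the patch: the int8 Winograd PrepareForRun
// above folds the input scale (and, for int8 outputs, the output scale) into
// the per-output-channel weight scales, so the int32 accumulator can be
// rescaled with one multiplier per channel at runtime. The helper name below
// is hypothetical.
#include <vector>

static std::vector<float> FoldConvScales(std::vector<float> weight_scale,
                                         float input_scale,
                                         float output_scale,
                                         bool int8_output) {
  for (auto& ws : weight_scale) {
    ws *= input_scale;     // acc_int32 * ws ~ output in float units
    if (int8_output) {
      ws /= output_scale;  // acc_int32 * ws ~ output in int8 units
    }
  }
  // The bias is rescaled the same way for int8 outputs (bias / output_scale).
  return weight_scale;
}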
-#include "lite/kernels/arm/decode_bboxes_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/decode_bboxes_compute.h" namespace paddle { namespace lite { @@ -115,9 +117,7 @@ void decode_bboxes_compute_ref(const operators::DecodeBboxesParam& param) { } TEST(decode_bboxes_arm, retrive_op) { - auto decode_bboxes = - KernelRegistry::Global().Create( - "decode_bboxes"); + auto decode_bboxes = KernelRegistry::Global().Create("decode_bboxes"); ASSERT_FALSE(decode_bboxes.empty()); ASSERT_TRUE(decode_bboxes.front()); } diff --git a/lite/kernels/arm/deformable_conv_compute.cc b/lite/kernels/arm/deformable_conv_compute.cc index 6253b661d05535d7b3b4a2ee18de7707e80b2877..dfdd27799bc1df7f403f40cb50b48aebbfb8d67a 100644 --- a/lite/kernels/arm/deformable_conv_compute.cc +++ b/lite/kernels/arm/deformable_conv_compute.cc @@ -235,7 +235,8 @@ typedef paddle::lite::kernels::arm::DeformableConvCompute DeformableConvFp32; -REGISTER_LITE_KERNEL(deformconv2d, kARM, kFloat, kNCHW, DeformableConvFp32, def) +REGISTER_LITE_KERNEL( + deformable_conv, kARM, kFloat, kNCHW, DeformableConvFp32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) diff --git a/lite/kernels/arm/deformable_conv_compute.h b/lite/kernels/arm/deformable_conv_compute.h index 6c8995ddd447a4382ee40e00f3b31832566ad9e9..17fae957619b7754637023a21169da9641686e59 100644 --- a/lite/kernels/arm/deformable_conv_compute.h +++ b/lite/kernels/arm/deformable_conv_compute.h @@ -17,6 +17,7 @@ #include "lite/backends/arm/math/funcs.h" #include "lite/core/kernel.h" #ifdef LITE_WITH_PROFILE +#include #include "lite/core/profile/profiler.h" #endif @@ -56,8 +57,9 @@ class DeformableConvCompute : public KernelLite { #ifdef LITE_WITH_PROFILE virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { - impl_->SetProfileRuntimeKernelInfo(ch); + ch->kernel_func_name = kernel_func_name_; } + std::string kernel_func_name_{"NotImplForDeformableConv"}; #endif ~DeformableConvCompute() = default; diff --git a/lite/kernels/arm/dropout_compute_test.cc b/lite/kernels/arm/dropout_compute_test.cc index 1c0f8db347304076caee23ee3d295bcfacbe2a1f..0aa16b8d348d7b8415120051df0e9732fada4495 100644 --- a/lite/kernels/arm/dropout_compute_test.cc +++ b/lite/kernels/arm/dropout_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/dropout_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/dropout_compute.h" namespace paddle { namespace lite { @@ -30,9 +32,7 @@ TEST(dropout_arm, init) { } TEST(dropout, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/arm/elementwise_compute.cc b/lite/kernels/arm/elementwise_compute.cc index 8115700f5950ddfcb71df49e6a21528563f23d95..3e898d9ded2153588c164d2ccd618fc77f7c3854 100644 --- a/lite/kernels/arm/elementwise_compute.cc +++ b/lite/kernels/arm/elementwise_compute.cc @@ -202,17 +202,13 @@ void ElementwiseMulCompute::Run() { } } -template <> -void ElementwiseMulCompute::Run() { - auto& param = this->template Param(); - lite::arm::math::elementwise_compute_basic(param, "mul", ""); -} - -void ElementwiseMulActivationCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseMulActivationCompute::Run() { + auto& param = + this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; std::string act_type = param.act_type; auto x_dims = param.X->dims(); @@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() { if (x_dims.size() < y_dims.size() && is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( y_data, x_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( x_data, y_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu( + lite::arm::math::elementwise_mul_relu( x_data, y_data, out_data, x_dims.production()); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; @@ -300,11 +296,12 @@ void ElementwiseMaxActivationCompute::Run() { } } -void ElementwiseDivCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseDivCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; auto x_dims = param.X->dims(); auto y_dims = param.Y->dims(); @@ -313,10 +310,10 @@ void ElementwiseDivCompute::Run() { LOG(FATAL) << "elewise div don't support x_dims size < y_dims size"; } if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { - lite::arm::math::elementwise_div_broadcast( + lite::arm::math::elementwise_div_broadcast( x_data, y_data, out_data, pre, n, post); } else { - lite::arm::math::elementwise_div( + lite::arm::math::elementwise_div( x_data, y_data, out_data, x_dims.production()); } } @@ -351,6 +348,29 @@ void ElementwiseDivActivationCompute::Run() { } } +template +void 
ElementwiseModCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + y_data, x_data, out_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + x_data, y_data, out_data, pre, n, post); + } else { + lite::arm::math::elementwise_mod( + x_data, y_data, out_data, x_dims.production()); + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -402,46 +422,60 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_float = +using elementwise_mul_float_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def) + elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_int32 = +using elementwise_mul_int32_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def) + elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); -using elementwise_mul_int64 = +using elementwise_mul_int64_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def) + elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - fusion_elementwise_mul_activation, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseMulActivationCompute, - def) +using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kFloat, + kNCHW, + fusion_elementwise_mul_activation_float_t, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kInt64, + kNCHW, + fusion_elementwise_mul_activation_int64_t, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + 
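// Illustrative reference, not part of the patch: the new ElementwiseModCompute
// above reuses the pre/n/post broadcast decomposition shared by the other
// elementwise kernels: the larger tensor is viewed as [pre, n, post] and the
// smaller one as [n], broadcast over the pre and post axes. A scalar reference
// of the broadcast int64 mod path (hypothetical name):
#include <cstdint>

static void elementwise_mod_broadcast_ref(const int64_t* x,
                                          const int64_t* y,
                                          int64_t* out,
                                          int pre,
                                          int n,
                                          int post) {
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int k = 0; k < post; ++k) {
        int idx = (i * n + j) * post + k;
        out[idx] = x[idx] % y[j];  // plain C++ %, as in the integer test reference
      }
    }
  }
}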
REGISTER_LITE_KERNEL(elementwise_max, kARM, kFloat, @@ -465,17 +499,27 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -REGISTER_LITE_KERNEL(elementwise_div, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseDivCompute, - def) +using elementwise_div_fp32_t = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using elementwise_div_int64_t = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + REGISTER_LITE_KERNEL( fusion_elementwise_div_activation, kARM, @@ -487,3 +531,13 @@ REGISTER_LITE_KERNEL( .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +using elementwise_mod_int64_t = + paddle::lite::kernels::arm::ElementwiseModCompute; +REGISTER_LITE_KERNEL( + elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64_t, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_compute.h b/lite/kernels/arm/elementwise_compute.h index 731010a0d189c08f031363e6df95652c000a237b..89d9898648d25fec98568f2456fe96903da0a69d 100644 --- a/lite/kernels/arm/elementwise_compute.h +++ b/lite/kernels/arm/elementwise_compute.h @@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite { virtual ~ElementwiseMulCompute() = default; }; -class ElementwiseMulActivationCompute - : public KernelLite { +template +class ElementwiseMulActivationCompute : public KernelLite { public: void Run() override; @@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute virtual ~ElementwiseMaxActivationCompute() = default; }; -class ElementwiseDivCompute - : public KernelLite { +template +class ElementwiseDivCompute : public KernelLite { public: void Run() override; @@ -102,6 +102,22 @@ class ElementwiseDivActivationCompute virtual ~ElementwiseDivActivationCompute() = default; }; +template +class ElementwiseModCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModCompute() = default; +}; + +// class ElementwiseModActivationCompute +// : public KernelLite { +// public: +// void Run() override; + +// virtual ~ElementwiseModActivationCompute() = default; +// }; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/elementwise_compute_test.cc b/lite/kernels/arm/elementwise_compute_test.cc index b0ac3a7d33d92239c83147a3fe7615cd2fbf0249..79262fb4ef75283eba12efa0a4ad8dc048681338 100644 --- a/lite/kernels/arm/elementwise_compute_test.cc +++ b/lite/kernels/arm/elementwise_compute_test.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/elementwise_compute.h" #include + +#include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" namespace paddle { namespace lite { @@ -24,9 +27,7 @@ namespace kernels { namespace arm { TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } @@ -140,6 +141,119 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param, } } +template +void elementwise_fmod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = fmod(diny_data + fmod(*din_ptr, diny_data), diny_data); + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template +void elementwise_imod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (*din_ptr) % diny_data; + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template void elementwise_fmod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); + TEST(elementwise_add, compute) { ElementwiseAddCompute elementwise_add; operators::ElementwiseParam param; @@ -222,8 +336,7 @@ TEST(elementwise_add, compute) { TEST(fusion_elementwise_add_activation_arm, retrive_op) { auto fusion_elementwise_add_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_add_activation"); + KernelRegistry::Global().Create("fusion_elementwise_add_activation"); ASSERT_FALSE(fusion_elementwise_add_activation.empty()); ASSERT_TRUE(fusion_elementwise_add_activation.front()); } @@ -321,9 +434,7 @@ TEST(fusion_elementwise_add_activation_arm, compute) { } TEST(elementwise_mul_arm, retrive_op) { - auto elementwise_mul = - KernelRegistry::Global().Create( - "elementwise_mul"); + auto elementwise_mul = KernelRegistry::Global().Create("elementwise_mul"); ASSERT_FALSE(elementwise_mul.empty()); ASSERT_TRUE(elementwise_mul.front()); } @@ -416,20 +527,21 @@ TEST(elementwise_mul, compute) { TEST(fusion_elementwise_mul_activation_arm, retrive_op) { auto fusion_elementwise_mul_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_mul_activation"); + KernelRegistry::Global().Create("fusion_elementwise_mul_activation"); ASSERT_FALSE(fusion_elementwise_mul_activation.empty()); ASSERT_TRUE(fusion_elementwise_mul_activation.front()); } TEST(fusion_elementwise_mul_activation_arm, init) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat)); ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM)); } TEST(fusion_elementwise_mul_activation_arm, compute) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; operators::FusionElementwiseActivationParam param; lite::Tensor x, y, output, output_ref; @@ -515,9 +627,7 @@ TEST(fusion_elementwise_mul_activation_arm, compute) { } TEST(elementwise_max_arm, retrive_op) { - auto elementwise_max = - KernelRegistry::Global().Create( - "elementwise_max"); + auto elementwise_max = KernelRegistry::Global().Create("elementwise_max"); ASSERT_FALSE(elementwise_max.empty()); ASSERT_TRUE(elementwise_max.front()); } @@ -610,8 +720,7 @@ TEST(elementwise_max, compute) { TEST(fusion_elementwise_max_activation_arm, retrive_op) { auto fusion_elementwise_max_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_max_activation"); + KernelRegistry::Global().Create("fusion_elementwise_max_activation"); ASSERT_FALSE(fusion_elementwise_max_activation.empty()); ASSERT_TRUE(fusion_elementwise_max_activation.front()); } @@ -685,7 +794,7 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } for (int i = 0; i < y_dim.production(); i++) { float sign = i % 2 == 0 ? 
0.5f : -0.5f; - y_data[i] = i * sign; + y_data[i] = (i + 1) * sign; } param.X = &x; param.Y = &y; @@ -708,6 +817,106 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } } +TEST(elementwise_mod_int64_arm, retrive_op) { + auto elementwise_mod = KernelRegistry::Global().Create("elementwise_mod"); + ASSERT_FALSE(elementwise_mod.empty()); + ASSERT_TRUE(elementwise_mod.front()); +} + +TEST(elementwise_mod_int64_arm, init) { + ElementwiseModCompute elementwise_mod; + ASSERT_EQ(elementwise_mod.precision(), PRECISION(kInt64)); + ASSERT_EQ(elementwise_mod.target(), TARGET(kARM)); +} + +TEST(elementwise_mod_int64_arm, compute) { + ElementwiseModCompute elementwise_mod; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#else + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i + 1; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = y_dim.production() - i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_mod.SetParam(param); + elementwise_mod.Run(); + param.Out = &output_ref; + elementwise_imod_compute_ref(param, ""); + for (int i = 0; i < output.dims().production(); i++) { + if (std::abs(output_data[i] - output_ref_data[i]) > 1e-5 || + std::isnan(output_data[i]) || + std::isnan(output_ref_data[i])) { + LOG(FATAL) << "elementwise mod cmp error, i: " << i + << ", x_data: " << x_data[i] + << ", y_data: " << y_data[i] + << ", output_data: " << output_data[i] + << ", output_ref_data: " << output_ref_data[i]; + } + } + } + } + } + } + } + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -719,3 +928,4 @@ USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def); diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 0ff1cd6b0dc26cdb2b45b00e34baced1bc5fa131..6e3a620a4a8989807481cb0f56ac91643eda4ce7 100644 --- 
a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -88,7 +88,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = flag_gemm_ ? param.w->data() : weights_.data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -149,8 +149,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -208,8 +207,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); diff --git a/lite/kernels/arm/fc_compute.h b/lite/kernels/arm/fc_compute.h index 4f8a82a8689c1f221ee146176ff7074602cad1c9..e45758775d99112afa0a7e3a45e1c15a9ea371aa 100644 --- a/lite/kernels/arm/fc_compute.h +++ b/lite/kernels/arm/fc_compute.h @@ -104,9 +104,11 @@ class FcCompute : public KernelLite { CHECK_EQ(k_, static_cast(w_dims[0])); flag_gemm_ = check_fc_use_gemm( m_, param.weight_scale, param.bias != nullptr); - if (!flag_trans_weights_ && !flag_gemm_) { - flag_trans_weights_ = true; - fc_trans_weights(*param.w, &weights_); + if (flag_trans_weights_ == flag_gemm_) { + flag_trans_weights_ = !flag_trans_weights_; + Tensor tmp_tensor; + fc_trans_weights(*param.w, &tmp_tensor); + param.w->CopyDataFrom(tmp_tensor); } } @@ -117,7 +119,6 @@ class FcCompute : public KernelLite { private: DDim last_shape_; - Tensor weights_; Tensor bias_; bool flag_trans_weights_{false}; bool flag_trans_bias_{false}; diff --git a/lite/kernels/arm/gather_compute.cc b/lite/kernels/arm/gather_compute.cc index 3efacc4aacefcb150d53738c950ec9e797ed78c7..f5a87e5431955252e47143252ce13ba4056c4a7f 100644 --- a/lite/kernels/arm/gather_compute.cc +++ b/lite/kernels/arm/gather_compute.cc @@ -20,44 +20,45 @@ namespace lite { namespace kernels { namespace arm { -template +template void GatherFunc(const operators::GatherParam& param) { auto src_dims = param.X->dims(); auto index_size = param.Index->dims()[0]; - auto* p_src = param.X->data(); - const int* p_index = param.Index->data(); - auto* p_output = param.Out->mutable_data(); + auto* p_src = param.X->data(); + const IndexType* p_index = param.Index->data(); + auto* p_output = param.Out->mutable_data(); int slice_size = 1; for (size_t i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; } for (int i = 0; i < index_size; ++i) { - int index_ = p_index[i]; + IndexType index_ = p_index[i]; memcpy(p_output + i * slice_size, p_src + index_ * slice_size, - slice_size * sizeof(T)); + slice_size * sizeof(DataType)); } } -void GatherCompute::Run() { - auto& param = this->Param(); +template +void GatherCompute::Run() { + auto& param = this->template Param(); switch (param.X->precision()) { case PRECISION(kFloat): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt8): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt16): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt32): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt64): - 
GatherFunc(param); + GatherFunc(param); break; default: LOG(FATAL) << "Gather does not implement for the " @@ -70,10 +71,26 @@ void GatherCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); + +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def_int64_idx) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/arm/gather_compute.h b/lite/kernels/arm/gather_compute.h index 9753f42972407b250886afa6bada8861a642e189..0226e5f68eee3f23dbd945af6f4f455ab79190c5 100644 --- a/lite/kernels/arm/gather_compute.h +++ b/lite/kernels/arm/gather_compute.h @@ -23,6 +23,7 @@ namespace lite { namespace kernels { namespace arm { +template class GatherCompute : public KernelLite { public: void Run() override; diff --git a/lite/kernels/arm/group_norm_compute.cc b/lite/kernels/arm/group_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e370414f4079f8dbbc2e5cc9af294c7b3f88718 --- /dev/null +++ b/lite/kernels/arm/group_norm_compute.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
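The gather rework above makes both the element type and the index type template parameters, so one kernel body serves float/int8/int16/int32/int64 data with either an int32 or an int64 Index tensor (the second registration, def_int64_idx, binds the int64 case). As a rough standalone sketch of the copy it performs, with simplified names rather than the real Lite API:

#include <cstdint>
#include <cstring>

// Illustrative only: out[i, :] = src[index[i], :] for each gathered row,
// where slice_size is the product of all dimensions after axis 0.
template <typename DataType, typename IndexType>
void gather_rows(const DataType* src,
                 const IndexType* index,
                 DataType* out,
                 int index_size,
                 int slice_size) {
  for (int i = 0; i < index_size; ++i) {
    std::memcpy(out + i * slice_size,
                src + static_cast<int64_t>(index[i]) * slice_size,
                slice_size * sizeof(DataType));
  }
}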
+ +#include "lite/kernels/arm/group_norm_compute.h" +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void GroupNormCompute::PrepareForRun() {} + +void GroupNormCompute::Run() { + auto& param = this->Param(); + const float* in = param.x->data(); + const float* scale = param.scale->data(); + const float* bias = param.bias->data(); + float* out = param.out->mutable_data(); + float* saved_mean = param.saved_mean->mutable_data(); + float* saved_variance = param.saved_variance->mutable_data(); + float epsilon = param.epsilon; + int groups = param.groups; + int channels = param.channels; + int n = param.x->dims()[0]; + int c = param.x->dims()[1]; + int ch_per_group = channels / groups; + int height = param.x->dims()[2]; + int width = param.x->dims()[3]; + int spatial_size = ch_per_group * height * width; + int ngroup = n * groups; + int cnt = spatial_size >> 4; + int remain = spatial_size % 16; +// compute saved_mean and saved_variance +#pragma omp parallel for + for (int n = 0; n < ngroup; ++n) { + const float* in_p = in + n * spatial_size; + float sum_spatial = 0.f; + float summ_spatial = 0.f; + float32x4_t sum0 = vdupq_n_f32(0.f); + float32x4_t sum1 = vdupq_n_f32(0.f); + float32x4_t sum2 = vdupq_n_f32(0.f); + float32x4_t sum3 = vdupq_n_f32(0.f); + float32x4_t summ0 = vdupq_n_f32(0.f); + float32x4_t summ1 = vdupq_n_f32(0.f); + float32x4_t summ2 = vdupq_n_f32(0.f); + float32x4_t summ3 = vdupq_n_f32(0.f); + for (int i = 0; i < cnt; i++) { + float32x4_t in0 = vld1q_f32(in_p); + float32x4_t in1 = vld1q_f32(in_p + 4); + float32x4_t in2 = vld1q_f32(in_p + 8); + float32x4_t in3 = vld1q_f32(in_p + 12); + sum0 = vaddq_f32(sum0, in0); + summ0 = vmlaq_f32(summ0, in0, in0); + sum1 = vaddq_f32(sum1, in1); + summ1 = vmlaq_f32(summ1, in1, in1); + sum2 = vaddq_f32(sum2, in2); + summ2 = vmlaq_f32(summ2, in2, in2); + sum3 = vaddq_f32(sum3, in3); + summ3 = vmlaq_f32(summ3, in3, in3); + in_p += 16; + } + for (int i = 0; i < remain - 3; i += 4) { + float32x4_t in0 = vld1q_f32(in_p); + sum1 = vaddq_f32(sum1, in0); + summ1 = vmlaq_f32(summ1, in0, in0); + in_p += 4; + } + float sum = 0.0; + float summ = 0.0; + sum0 = vaddq_f32(sum0, sum1); + sum2 = vaddq_f32(sum2, sum3); + summ0 = vaddq_f32(summ0, summ1); + summ2 = vaddq_f32(summ2, summ3); + for (int i = 0; i < remain % 4; i++) { + sum += *in_p; + summ += (*in_p) * (*in_p); + in_p++; + } + sum0 = vaddq_f32(sum0, sum2); + summ0 = vaddq_f32(summ0, summ2); + float32x2_t sum_low = vpadd_f32(vget_low_f32(sum0), vget_high_f32(sum0)); + float32x2_t sum_high = vpadd_f32(vget_low_f32(summ0), vget_high_f32(summ0)); + float32x2_t sum_mix = vpadd_f32(sum_low, sum_high); + sum += vget_lane_f32(sum_mix, 0); + summ += vget_lane_f32(sum_mix, 1); + float mean = sum / spatial_size; + // float variance = summ / spatial_size - mean * mean; + // the flolowing code has higher precision than above comment code + float variance = (summ - mean * mean * spatial_size) / spatial_size; + float std = 1.f / sqrtf(variance + epsilon); + saved_mean[n] = mean; + saved_variance[n] = std; + } + int in_size = height * width; + cnt = in_size >> 4; + remain = in_size % 16; +// compute Group_norm result: out = scale * (in - mean) / std + bias +#pragma omp parallel for + for (int i = 0; i < ngroup; ++i) { + const float* in_p = in + i * spatial_size; + float* out_p = out + i * spatial_size; + int numc = i % groups; + numc *= ch_per_group; + for (int c = 0; c < 
ch_per_group; c++) { + int chin = numc + c; + const float sstd_val = scale[chin] * saved_variance[i]; + const float bias_val = bias[chin]; + const float mean_val = saved_mean[i]; + const float32x4_t vsstd = vdupq_n_f32(sstd_val); + const float32x4_t vbias = vdupq_n_f32(bias_val); + const float32x4_t vmean = vdupq_n_f32(mean_val); + for (int k = 0; k < cnt; k++) { + float32x4_t in0 = vld1q_f32(in_p); + float32x4_t in1 = vld1q_f32(in_p + 4); + float32x4_t in2 = vld1q_f32(in_p + 8); + float32x4_t in3 = vld1q_f32(in_p + 12); + float32x4_t submean0 = vsubq_f32(in0, vmean); + float32x4_t submean1 = vsubq_f32(in1, vmean); + float32x4_t submean2 = vsubq_f32(in2, vmean); + float32x4_t submean3 = vsubq_f32(in3, vmean); + float32x4_t out0 = vmlaq_f32(vbias, submean0, vsstd); + float32x4_t out1 = vmlaq_f32(vbias, submean1, vsstd); + float32x4_t out2 = vmlaq_f32(vbias, submean2, vsstd); + float32x4_t out3 = vmlaq_f32(vbias, submean3, vsstd); + vst1q_f32(out_p, out0); + vst1q_f32(out_p + 4, out1); + vst1q_f32(out_p + 8, out2); + vst1q_f32(out_p + 12, out3); + in_p += 16; + out_p += 16; + } + for (int k = 0; k < remain - 3; k += 4) { + float32x4_t in0 = vld1q_f32(in_p); + in_p += 4; + float32x4_t submean0 = vsubq_f32(in0, vmean); + float32x4_t out0 = vmlaq_f32(vbias, submean0, vsstd); + vst1q_f32(out_p, out0); + out_p += 4; + } + for (int k = 0; k < remain % 4; k++) { + *out_p = (*in_p - mean_val) * sstd_val + bias_val; + in_p++; + out_p++; + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(group_norm, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::GroupNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/group_norm_compute.h similarity index 81% rename from lite/kernels/arm/activation_grad_compute.h rename to lite/kernels/arm/group_norm_compute.h index ef03f58fa8cd499192aa6edfe3a7c51b49b14f65..7d61b8ec8d9a1c8620c54858487b21691bef84d5 100644 --- a/lite/kernels/arm/activation_grad_compute.h +++ b/lite/kernels/arm/group_norm_compute.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include #include "lite/core/kernel.h" #include "lite/core/op_registry.h" @@ -22,13 +21,17 @@ namespace lite { namespace kernels { namespace arm { -class SquareGradCompute : public KernelLite { +class GroupNormCompute : public KernelLite { public: - using param_t = operators::ActivationGradParam; + using param_t = operators::GroupNormParam; + + void PrepareForRun() override; void Run() override; - virtual ~SquareGradCompute() = default; + virtual ~GroupNormCompute() = default; + + private: }; } // namespace arm diff --git a/lite/kernels/arm/layer_norm_compute_test.cc b/lite/kernels/arm/layer_norm_compute_test.cc index 22fe3d06569fac424ab797712142b4d088dc7d3a..e84f9f133ce0cdecb714dc535c0f5833597105c6 100644 --- a/lite/kernels/arm/layer_norm_compute_test.cc +++ b/lite/kernels/arm/layer_norm_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
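The group_norm kernel above is a NEON implementation of the usual two-pass scheme: for every (batch, group) pair it accumulates the sum and the sum of squares to obtain the mean and 1/sqrt(variance + epsilon), then applies the per-channel affine transform. A plain scalar reference of that math, assuming the same NCHW layout (a sketch, not the kernel itself):

#include <cmath>

// Sketch: x and y are NCHW float buffers; the c channels are split into
// `groups` consecutive groups, each normalized over ch_per_group * h * w
// elements.
void group_norm_ref(const float* x, const float* scale, const float* bias,
                    float* y, int n, int c, int h, int w,
                    int groups, float epsilon) {
  const int ch_per_group = c / groups;
  const int spatial = ch_per_group * h * w;
  for (int ng = 0; ng < n * groups; ++ng) {
    const float* xg = x + ng * spatial;
    float sum = 0.f, sqsum = 0.f;
    for (int i = 0; i < spatial; ++i) {
      sum += xg[i];
      sqsum += xg[i] * xg[i];
    }
    const float mean = sum / spatial;
    const float var = (sqsum - mean * mean * spatial) / spatial;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int ch = 0; ch < ch_per_group; ++ch) {
      const int chin = (ng % groups) * ch_per_group + ch;  // global channel id
      const float* xc = xg + ch * h * w;
      float* yc = y + ng * spatial + ch * h * w;
      for (int i = 0; i < h * w; ++i) {
        yc[i] = (xc[i] - mean) * inv_std * scale[chin] + bias[chin];
      }
    }
  }
}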
-#include "lite/kernels/arm/layer_norm_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/layer_norm_compute.h" namespace paddle { namespace lite { @@ -181,9 +183,7 @@ TEST(layer_norm_arm, compute) { } TEST(layer_norm, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index e7030d00427e55c7faf333997cd90cba46260cd4..9afd05b80aaffdc4be2ae1deaa5993b8fd21dce4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/lrn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/lrn_compute.h" namespace paddle { namespace lite { @@ -133,8 +135,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { } TEST(lrn_arm, retrive_op) { - auto lrn = - KernelRegistry::Global().Create("lrn"); + auto lrn = KernelRegistry::Global().Create("lrn"); ASSERT_FALSE(lrn.empty()); ASSERT_TRUE(lrn.front()); } diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc index 914a58308bdf0d5c6d374d5f81ca38224941c85d..f8d92dfdc740988733ad26d5385b17050b490635 100644 --- a/lite/kernels/arm/merge_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/merge_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/merge_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/merge_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(merge_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "merge_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("merge_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/mul_compute_test.cc b/lite/kernels/arm/mul_compute_test.cc index cddee81fe22897dbe91721ed172b144539e0852c..76ab95b93485b3e6701dca6224ce2a5f7a8b3df7 100644 --- a/lite/kernels/arm/mul_compute_test.cc +++ b/lite/kernels/arm/mul_compute_test.cc @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/mul_compute.h" #include + #include #include #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" namespace paddle { namespace lite { @@ -69,8 +71,7 @@ void FillData(T* a, } TEST(mul_arm, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index acdaf0d0131621c1c2403b8a071d6cb1134f4565..c4aeb20a5bf53d80be4b407698a51ead46f6b8f5 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/pool_compute.h" #include + #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/pool_compute.h" namespace paddle { namespace lite { @@ -341,8 +343,7 @@ TEST(pool_arm, compute) { } TEST(pool_arm, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/arm/scale_compute_test.cc b/lite/kernels/arm/scale_compute_test.cc index 0d327b9807d306770850b09ed1ed2a0337104c92..fe5e1911d0cc2c012876731f50bd04b3125b8fa2 100644 --- a/lite/kernels/arm/scale_compute_test.cc +++ b/lite/kernels/arm/scale_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/scale_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/scale_compute.h" namespace paddle { namespace lite { @@ -103,8 +105,7 @@ TEST(scale_arm, compute) { } TEST(scale, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index a70b6717097ec0ffdaa24ba257bfdf8dbd536f3f..455615e66de53a4a6f235f8ab803394962292936 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/sgemm.h" #include "lite/core/op_registry.h" @@ -88,7 +89,7 @@ void SequenceConvCompute::Run() { paddle::lite::arm::math::im2col( sub_in_data, 1, - sequence_len, + input_row_end - input_row_begin, hidden_dim, // C H W -> 1, seq_len, hidden_dim kernel_size, hidden_dim, // kernel_h, kernel_w @@ -101,10 +102,14 @@ void SequenceConvCompute::Run() { 1, 1, // stride_h, stride_w, dilation_h, dilation_w tmp_data); - local_naive_transpose(tmp_data, - sub_col_data, - kernel_size * hidden_dim, - input_row_end - input_row_begin); + int cols = kernel_size * hidden_dim; + int rows = input_row_end - input_row_begin; + if (cols % 4 == 0 && rows % 4 == 0) { + paddle::lite::arm::math::local_transpose( + tmp_data, sub_col_data, cols, rows); + } else { + local_naive_transpose(tmp_data, sub_col_data, cols, rows); + } } } diff --git a/lite/kernels/arm/softmax_compute.cc b/lite/kernels/arm/softmax_compute.cc index 3409d0f5c5bd6e7ce1ea77809f7715b62bb10ca2..79ea23ab3fad3340c63846ea11cc89b371f5c6c9 100644 --- a/lite/kernels/arm/softmax_compute.cc +++ b/lite/kernels/arm/softmax_compute.cc @@ -34,7 +34,7 @@ void SoftmaxCompute::Run() { int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int axis_size = x_dims[axis]; if (inner_num == 1) { - if (axis_size >= 4) { + if (axis_size > 4) { lite::arm::math::softmax_inner1_large_axis( din, dout, outer_num, axis_size); } else { diff --git a/lite/kernels/arm/softmax_compute_test.cc b/lite/kernels/arm/softmax_compute_test.cc index 459112d8c0169375584baf0cb983037682e47a3d..486ccf2cedd1af3ce0d7cc2f7d0aeecaadf15ca9 100644 --- a/lite/kernels/arm/softmax_compute_test.cc +++ b/lite/kernels/arm/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/arm/split_compute_test.cc b/lite/kernels/arm/split_compute_test.cc index 034fbb85c487df6159a6a22b9958cc9e64d9e1c6..c51ea186b52a77abec5c7560b0a028079bea4aba 100644 --- a/lite/kernels/arm/split_compute_test.cc +++ b/lite/kernels/arm/split_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
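In the sequence_conv change above, the im2col height now uses the rows of the current sequence (input_row_end - input_row_begin) rather than the whole sequence_len, and the resulting buffer is transposed into the GEMM layout through the vectorized local_transpose only when both dimensions are multiples of 4, falling back to the element-by-element path otherwise. A sketch of that fallback, with the argument order and orientation of local_naive_transpose assumed here for illustration:

// Illustrative fallback only; `in` is assumed rows x cols row-major and
// `out` becomes cols x rows row-major.
void naive_transpose_sketch(const float* in, float* out, int cols, int rows) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      out[c * rows + r] = in[r * cols + c];
    }
  }
}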
-#include "lite/kernels/arm/split_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_compute.h" namespace paddle { namespace lite { @@ -165,8 +167,7 @@ TEST(split_arm, compute) { } TEST(split, retrive_op) { - auto split = - KernelRegistry::Global().Create("split"); + auto split = KernelRegistry::Global().Create("split"); ASSERT_FALSE(split.empty()); ASSERT_TRUE(split.front()); } diff --git a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc index 3b2004c786698b70b4c54b68d696a9cf5f5221fd..03f5a21890ffd515e83de7895c2be886b15b8967 100644 --- a/lite/kernels/arm/split_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/split_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/split_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(split_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "split_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("split_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/transpose_compute_test.cc b/lite/kernels/arm/transpose_compute_test.cc index aaf3f138a54db2c7ff766325cfd61bc51ec8b1d2..74fd14754637427277a6b19b820bb5d3de66c418 100644 --- a/lite/kernels/arm/transpose_compute_test.cc +++ b/lite/kernels/arm/transpose_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/transpose_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/transpose_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(transpose_arm, compute_shape_nchw) { } TEST(transpose, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -189,9 +189,7 @@ TEST(transpose2_arm, compute_shape_nchw) { } TEST(transpose2, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/arm/while_compute.h b/lite/kernels/arm/while_compute.h deleted file mode 100644 index f735d96f9190755daacdf846a2d99901c1a14493..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/while_compute.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/operators/while_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class StepExecutor { - typedef std::shared_ptr OpPtr; - - public: - StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - // VLOG(4) << "while: creating Op [" << op_type << "]"; - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { - for (auto &op_handler : ops_of_block_) { - // VLOG(4) << op_handler->op_info()->Repr(); - op_handler->InferShape(); - // VLOG(4) << "while: infered shape"; - op_handler->Run(); - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class WhileCompute : public KernelLite { - public: - using param_t = operators::WhileParam; - - void Run() override; - void PrepareForRun() override; - - virtual ~WhileCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/bm/bridges/batch_norm_op.cc b/lite/kernels/bm/bridges/batch_norm_op.cc index fbf70178fdd971edce34b3253b02febfa3e3b85c..f5ecc0825a17f26b1cf65605ea2e8c0c93338f39 100644 --- a/lite/kernels/bm/bridges/batch_norm_op.cc +++ b/lite/kernels/bm/bridges/batch_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -64,10 +65,16 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto* bias_data = bias->mutable_data(); auto* mean_data = mean->mutable_data(); auto* variance_data = variance->mutable_data(); + + float* new_bias = static_cast(malloc(bias->memory_size())); + float* new_scale = static_cast(malloc(scale->memory_size())); + CHECK(new_bias != nullptr); + CHECK(new_scale != nullptr); + for (int c = 0; c < channel_size; c++) { float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon)); - bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - scale_data[c] = inv_scale * scale_data[c]; + new_bias[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale[c] = inv_scale * scale_data[c]; } const int input_num = 1; @@ -86,11 +93,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_dims.size(), static_cast(output_var_name.c_str()), static_cast(unique_op_name.c_str()), - static_cast(scale->mutable_data()), - static_cast(bias->mutable_data()), + static_cast(new_scale), + static_cast(new_bias), 1, 1, 1); + free(new_scale); + free(new_bias); delete[] shape; delete[] name; delete[] dim; diff --git a/lite/kernels/bm/bridges/density_prior_box_op.cc b/lite/kernels/bm/bridges/density_prior_box_op.cc index 137c5142d5ae544226dbe5d6cd7c872fc272b71a..895901d94e2b2077f530e196ef8f30d4f57df793 100644 --- a/lite/kernels/bm/bridges/density_prior_box_op.cc +++ b/lite/kernels/bm/bridges/density_prior_box_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/bm/bridges/interpolate_op.cc b/lite/kernels/bm/bridges/interpolate_op.cc index 8c2d39b16ac0206d83199fdeac6c30a0a352856e..a77ec4e8f788e581d9d226369210a449ec50840c 100644 --- a/lite/kernels/bm/bridges/interpolate_op.cc +++ b/lite/kernels/bm/bridges/interpolate_op.cc @@ -76,6 +76,8 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { static_cast(output_var_name.c_str()), 0, 0, + 0, + 0, type); } graph->AddNode(output_var_name); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index d7640e1ac7326d9764380469dc97a7806b044437..ea0dd82325976f33f123f21e0eb4aeb5dfdbfa9d 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -28,13 +28,17 @@ namespace lite { namespace kernels { namespace bm { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; subgraph::bm::Graph graph; const auto& bridges = subgraph::Registry::Instance(); graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -42,7 +46,7 @@ int SubgraphEngine::BuildDeviceProgram() { std::string op_type = op->op_info()->Type(); LOG(INFO) << op_type; if (!bridges.Exists(op_type, TARGET(kBM))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= @@ -50,12 +54,13 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if 
(subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - std::string net_name = "bmnetc_f32umodel"; + std::string net_name = "bmnet_f32bmodel"; + auto unique_net_name = lite::subgraph::bm::UniqueName(net_name); __bmcompile_opt( - graph.GetCompilerHandle(), const_cast(net_name.c_str()), 1); + graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 2); void* bmodel_data = nullptr; unsigned int data_size = 0; bm_hd_ = static_cast(ctx.GetHandle()); @@ -63,19 +68,17 @@ int SubgraphEngine::BuildDeviceProgram() { graph.UnlockCompilerMutex(); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { - return subgraph::FAILED; + return false; } bmrt_get_network_names(bmrt_hd_, &net_names_); net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); auto& stage = net_info_->stages[0]; // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); device_inputs_.resize(input_names_.size()); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(net_info_->input_names[i]); + origin_itensors_[i] = + exec_scope_->FindMutableTensor(net_info_->input_names[i]); CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); @@ -88,8 +91,6 @@ int SubgraphEngine::BuildDeviceProgram() { stage.input_shapes[i]); } // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); device_outputs_.resize(net_info_->output_num); int out_index = 0; for (int i = 0; i < output_names_.size(); i++) { @@ -97,14 +98,13 @@ int SubgraphEngine::BuildDeviceProgram() { } for (int i = 0; i < net_info_->output_num; i++) { - Tensor* t_cur = scope_->FindMutableTensor(net_info_->output_names[i]); + Tensor* t_cur = exec_scope_->FindMutableTensor(net_info_->output_names[i]); CHECK(t_cur != nullptr); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) { origin_otensors_[out_index] = t_cur; - origin_odims_[out_index] = origin_otensors_[out_index]->dims(); origin_otensors_[out_index]->mutable_data(); out_index += 1; } @@ -116,10 +116,10 @@ int SubgraphEngine::BuildDeviceProgram() { net_info_->output_dtypes[i], stage.output_shapes[i]); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_inputs_.size(); i++) { bm_memcpy_s2d(bm_hd_, device_inputs_[i].device_mem, @@ -143,24 +143,23 @@ int SubgraphEngine::LaunchDeviceProgram() { out_index++; } } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace bm diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h index 60f7661c7990d90020dbfc7ec3a6e0d178dceb70..d1dcb3a6d3ef7eb6d9091eb45d1960862cca273a 100644 --- a/lite/kernels/bm/subgraph_compute.h +++ b/lite/kernels/bm/subgraph_compute.h @@ -36,16 +36,20 @@ class 
SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; private: void *bmrt_hd_; diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 1a58a51c36a1ccbb21bb2830a197c096e7ddac51..3d396cfa12f8d89e4d868f5bce98cf143ab072ec 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -6,12 +6,16 @@ message(STATUS "compile with lite CUDA kernels") # basic kernels add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(gru_compute_cuda CUDA basic SRCS gru_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(matmul_compute_cuda CUDA basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps}) add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sigmoid_compute_cuda CUDA basic SRCS sigmoid_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_concat_compute_cuda CUDA extra SRCS sequence_pool_concat_compute.cu DEPS ${lite_kernel_deps}) @@ -34,7 +38,10 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute. 
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda}) add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_mask_compute_cuda CUDA extra SRCS sequence_mask_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_arithmetic_compute_cuda CUDA extra SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) @@ -44,6 +51,8 @@ add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps}) # unit test lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) @@ -53,6 +62,7 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda) nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda) nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) +nv_test(sigmoid_compute_cuda_test SRCS sigmoid_compute_test.cc DEPS sigmoid_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) @@ -60,7 +70,10 @@ nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda) +nv_test(gru_compute_cuda_test SRCS gru_compute_test.cc DEPS gru_compute_cuda) +nv_test(matmul_compute_cuda_test SRCS matmul_compute_test.cc DEPS matmul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) 
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) #nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) @@ -74,9 +87,14 @@ if(LITE_BUILD_EXTRA) nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) + nv_test(sequence_pad_compute_cuda_test SRCS sequence_pad_compute_test.cc DEPS sequence_pad_compute_cuda) + nv_test(sequence_unpad_compute_cuda_test SRCS sequence_unpad_compute_test.cc DEPS sequence_unpad_compute_cuda) + nv_test(sequence_mask_compute_cuda_test SRCS sequence_mask_compute_test.cc DEPS sequence_mask_compute_cuda) nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) + nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda) + nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda) endif() diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a2740101c2883b3b2f7c999bd96fd3fbd3ab3ce --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/kernels/cuda/assign_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void TensorFromVector(const std::vector& src, + lite::Tensor* dst, + cudaStream_t* stream) { + auto* src_ptr = static_cast(src.data()); + auto* dst_ptr = static_cast(dst->mutable_data(TARGET(kCUDA))); + auto size = src.size() * sizeof(T); + TargetWrapperCuda::MemcpyAsync( + dst_ptr, src_ptr, size, IoDirection::HtoD, *stream); +} + +void AssignValueCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + int dtype = param.dtype; + std::vector fp32_values = param.fp32_values; + std::vector int32_values = param.int32_values; + std::vector int64_values = param.int64_values; + std::vector bool_values = param.bool_values; + auto* out = param.Out; + + if (dtype == static_cast(lite::core::FluidType::INT32)) { + TensorFromVector(int32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + TensorFromVector(fp32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + TensorFromVector(int64_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + TensorFromVector(bool_values, out, &stream); + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + return; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(assign_value, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AssignValueCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/cuda/assign_value_compute.h b/lite/kernels/cuda/assign_value_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c334e36d8061437881a4ea67d960f87b7ffb3ceb --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueCompute : public KernelLite { + public: + using param_t = operators::AssignValueParam; + + void Run() override; + virtual ~AssignValueCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/assign_value_compute_test.cc b/lite/kernels/cuda/assign_value_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c29426b745e92f71bcfeca6a8fc2890cd1908b4 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/assign_value_compute.h" + +#include + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueTest : public ::testing::Test { + protected: + AssignValueTest() : dtype_(5), shape_({1}) { + int num = std::accumulate( + shape_.begin(), shape_.end(), 1, std::multiplies()); + fp32_values_.resize(num); + int32_values_.resize(num); + int64_values_.resize(num); + bool_values_.resize(num); + for (int i = 0; i < num; ++i) { + fp32_values_[i] = i + 5; + int32_values_[i] = i; + int64_values_[i] = i; + bool_values_[i] = i; + } + std::vector out_shape(shape_.size(), 0); + for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i]; + out_ref_.Resize(lite::DDim(out_shape)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + + RunBaseLine(&out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.shape = shape_; + param_.dtype = dtype_; + param_.fp32_values = fp32_values_; + param_.int32_values = int32_values_; + param_.int64_values = int64_values_; + param_.bool_values = bool_values_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() {} + + void InitHalfInput() {} + + void RunBaseLine(lite::Tensor* out) { + if (dtype_ == static_cast(lite::core::FluidType::INT32)) { + for (size_t i = 0; i < int32_values_.size(); ++i) { + out->mutable_data()[i] = int32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::FP32)) { + for (size_t i = 0; i < fp32_values_.size(); ++i) { + out->mutable_data()[i] = fp32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::INT64)) { + for (size_t i = 0; i < int64_values_.size(); ++i) { + out->mutable_data()[i] = int64_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::BOOL)) { + for (size_t i = 0; i < bool_values_.size(); ++i) { + out->mutable_data()[i] = bool_values_[i]; + } + } else { + LOG(FATAL) << "Unsupported dtype_ for assign_value_op:" << dtype_; + } + } + + int dtype_; + std::vector shape_; + std::vector fp32_values_; + std::vector int32_values_; + std::vector int64_values_; + std::vector bool_values_; + + lite::Tensor out_ref_; + lite::Tensor out_gpu_; + lite::Tensor out_cpu_; + + operators::AssignValueParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(AssignValueTest, fp32) { + InitFloatInput(); + AssignValueCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + 
cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/concat_compute_test.cc b/lite/kernels/cuda/concat_compute_test.cc index cc12fcd289d36c38f02663c6a7aaa0ec7c70653a..08dd4013a5ce75ea5abc0c9d678f7437276df161 100644 --- a/lite/kernels/cuda/concat_compute_test.cc +++ b/lite/kernels/cuda/concat_compute_test.cc @@ -69,7 +69,7 @@ void concat_compute_ref(const operators::ConcatParam& param) { std::vector input_cols(input.size()); for (int i = 0; i < num; ++i) { int input_i_numel = input[i]->dims().size() == 0 ? 0 : 1; - for (int didx = 0; didx < input[i]->dims().size(); ++didx) { + for (size_t didx = 0; didx < input[i]->dims().size(); ++didx) { input_i_numel *= input[i]->dims()[didx]; } int t_cols = input_i_numel / rows; diff --git a/lite/kernels/cuda/dropout_compute.cc b/lite/kernels/cuda/dropout_compute.cc index 7e3a3a62432f3bc5f2e62112b2b220abc17ee2bd..f9303a39cebda322526e6cc25401db35e1f4309b 100644 --- a/lite/kernels/cuda/dropout_compute.cc +++ b/lite/kernels/cuda/dropout_compute.cc @@ -23,6 +23,9 @@ namespace cuda { void DropoutCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); float* out_data = param.output->mutable_data(TARGET(kCUDA)); int num = param.x->dims().production(); @@ -31,7 +34,7 @@ void DropoutCompute::Run() { if (param.dropout_implementation == "downgrade_in_infer") { scale = 1.0f - prob_data; } - lite::cuda::math::scale(num, x_data, out_data, scale, 0); + lite::cuda::math::scale(num, x_data, out_data, scale, 0.f, stream); } } // namespace cuda diff --git a/lite/kernels/cuda/fc_compute.cu b/lite/kernels/cuda/fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ad376577b133540b782e2726564302a95ddf216 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.cu @@ -0,0 +1,353 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
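The dropout change above threads the CUDA exec stream into the scale kernel; at inference time, the "downgrade_in_infer" implementation multiplies activations by (1 - dropout_prob), while the other implementation leaves them unscaled. A scalar sketch of that behavior (illustrative, not the CUDA kernel):

#include <string>

// Sketch of inference-time dropout: no masking, only optional rescaling.
void dropout_infer_ref(const float* x, float* out, int num,
                       float dropout_prob, const std::string& impl) {
  const float scale =
      impl == "downgrade_in_infer" ? 1.0f - dropout_prob : 1.0f;
  for (int i = 0; i < num; ++i) {
    out[i] = x[i] * scale;
  }
}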
+#include "lite/kernels/cuda/fc_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct FcTypeTraits; + +template <> +struct FcTypeTraits { + typedef float4 Type; +}; + +template +__global__ void AddBiasV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hadd2(in_ptr, bias_ptr); +#else + half2 packed_val; + packed_val.x = __hadd(in_ptr.x, bias_ptr.x); + packed_val.y = __hadd(in_ptr.y, bias_ptr.y); + data[index] = packed_val; +#endif + } +} + +template +__global__ void AddBiasReluV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasReluV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hmul2(__hgt2(in_ptr + bias_ptr, __float2half2_rn(0.f)), + in_ptr + bias_ptr); +#else + const float2 bias = __half22float2(bias_ptr); + const float2 in = __half22float2(in_ptr); + data[index] = __floats2half2_rn( + bias.x + in.x > 0.0f ? static_cast(bias.x + in.x) : 0.0f, + bias.y + in.y > 0.0f ? 
static_cast(bias.y + in.y) : 0.0f); +#endif + } +} + +template +__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + packed_val.z = in_ptr.z + bias_ptr.z; + packed_val.w = in_ptr.w + bias_ptr.w; + data[index] = packed_val; + } +} + +template +__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + packed_val.z = fmaxf(0.f, in_ptr.z + bias_ptr.z); + packed_val.w = fmaxf(0.f, in_ptr.w + bias_ptr.w); + data[index] = packed_val; + } +} + +template +__global__ void AddBias(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = temp; + } +} + +template <> +__global__ void AddBias(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + data[offset + i] = temp; + } +} + +template +__global__ void AddBiasRelu(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = static_cast(temp > 0) * temp; + } +} + +template <> +__global__ void AddBiasRelu(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + +#if __CUDA_ARCH__ >= 530 + data[offset + i] = + __hgt(temp, __float2half(0.0f)) ? temp : __float2half(0.0f); +#else + data[offset + i] = + __float2half(__half2float(temp) > 0.f ? __half2float(temp) : 0.f); +#endif + } +} + +template +void FcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 4 == 0) { + const int threads = 256; + const int num = M * N / 4; + const int blocks = (num + threads - 1) / threads; + typedef typename FcTypeTraits::Type trans_type; + const auto* bias_ptr_v4 = reinterpret_cast(b_data); + auto* data_ptr_v4 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else if (activation_type == "") { + AddBiasV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +template <> +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 2 == 0) { + const int threads = 256; + const int num = M * N / 2; + const int blocks = (num + threads - 1) / threads; + const auto* bias_ptr_v2 = reinterpret_cast(b_data); + auto* data_ptr_v2 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else if (activation_type == "") { + AddBiasV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using FcFp32 = paddle::lite::kernels::cuda::FcCompute; + +using FcFp16 = paddle::lite::kernels::cuda::FcCompute; + +REGISTER_LITE_KERNEL(fc, kCUDA, kFloat, kNCHW, FcFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kCUDA, kFP16, kNCHW, FcFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/fc_compute.h b/lite/kernels/cuda/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..700194c115824762411e952c77d06cb01a754bc0 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
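The fc kernel above runs the cuBLAS-backed Gemm for Out = X * W and then adds the bias (optionally fused with ReLU) in a second elementwise pass, switching to float4-packed kernels when N is a multiple of 4 and to half2-packed kernels on the fp16 path when N is even. The self-contained sketch below illustrates that fp32 packed epilogue; the kernel name add_bias_relu_v4, the toy sizes, and the managed-memory setup are illustrative assumptions, not part of this patch.

// Minimal sketch of the vectorized bias+ReLU epilogue used after the GEMM.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void add_bias_relu_v4(int num, const float4* bias, float4* data, int k4) {
  // Grid-stride loop over M * N / 4 packed elements; k4 == N / 4, so
  // i % k4 selects the bias group for this output column block.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    float4 b = bias[i % k4];
    float4 v = data[i];
    v.x = fmaxf(0.f, v.x + b.x);
    v.y = fmaxf(0.f, v.y + b.y);
    v.z = fmaxf(0.f, v.z + b.z);
    v.w = fmaxf(0.f, v.w + b.w);
    data[i] = v;
  }
}

int main() {
  const int M = 8, N = 64;  // N % 4 == 0, so the packed path applies
  float *out, *bias;
  cudaMallocManaged(&out, M * N * sizeof(float));
  cudaMallocManaged(&bias, N * sizeof(float));
  for (int i = 0; i < M * N; ++i) out[i] = -1.f + i % 3;  // stand-in GEMM result
  for (int j = 0; j < N; ++j) bias[j] = 0.5f;
  int num = M * N / 4;
  int threads = 256, blocks = (num + threads - 1) / threads;
  add_bias_relu_v4<<<blocks, threads>>>(num,
                                        reinterpret_cast<float4*>(bias),
                                        reinterpret_cast<float4*>(out),
                                        N / 4);
  cudaDeviceSynchronize();
  printf("out[0]=%f out[1]=%f\n", out[0], out[1]);
  cudaFree(out);
  cudaFree(bias);
  return 0;
}

Packing four outputs per thread keeps this memory-bound pass coalesced and computes the bias index once per vector, which is the same rationale behind the N % 4 (fp32) and N % 2 (fp16) dispatch in the kernel above.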
+ +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~FcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute_test.cc b/lite/kernels/cuda/fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa0dada729ca01cb1a4176ca585ce8f921f3aa42 --- /dev/null +++ b/lite/kernels/cuda/fc_compute_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/fc_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class FcTest : public ::testing::Test { + protected: + FcTest() + : m_(8), + k_(16), + n_(64), + in_num_col_dims_(1), + act_type_("relu"), + x_shape_({m_, k_}), + w_shape_({k_, n_}), + b_shape_({n_}), + out_shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + b_ref_.Resize(lite::DDim(b_shape_)); + b_gpu_.Resize(lite::DDim(b_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + auto b_ref_data = b_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < b_ref_.numel(); i++) { + b_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.w = &w_gpu_; + param_.bias = &b_gpu_; + param_.in_num_col_dims = in_num_col_dims_; + param_.activation_type = act_type_; + param_.output = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + b_gpu_.Assign(b_ref_.data(), + b_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = 
half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + b_half_.Resize(b_ref_.dims()); + auto b_half_data = b_half_.mutable_data(); + for (int64_t i = 0; i < b_half_.numel(); i++) { + b_half_data[i] = half(lite::float16(b_ref_.data()[i])); + } + b_gpu_.Assign(b_half_data, b_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* w, + const lite::Tensor* b, + lite::Tensor* out) { + const float* data_in = x->data(); + const float* bias = b->data(); + const float* weights = w->data(); + float* data_out = out->mutable_data(); + int out_rows = x->dims()[0]; + int in_cols = x->numel() / out_rows; + int out_cols = w->numel() / in_cols; + int index_out; + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[k * out_cols + j]; + } + if (act_type_ == "relu") { + data_out[index_out] *= static_cast(data_out[index_out] > 0); + } + } + } + } + + int m_, k_, n_, in_num_col_dims_; + std::string act_type_; + std::vector x_shape_, w_shape_, b_shape_, out_shape_; + lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_, b_gpu_; + lite::Tensor x_half_, w_half_, b_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::FcParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(FcTest, TestFP32) { + InitFloatInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(FcTest, TestFP16) { + InitHalfInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } 
+} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/gru_compute.cu b/lite/kernels/cuda/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..ddca95048b303cce55cc3435b15f945a84fc8c0c --- /dev/null +++ b/lite/kernels/cuda/gru_compute.cu @@ -0,0 +1,394 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/bias.h" +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct GRUMetaValue { + T* gate_weight; + T* state_weight; + T* gate_value; + T* reset_output_value; + T* output_value; + T* prev_out_value; +}; + +template +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + if (lite::TargetWrapperCuda::GetComputeCapability() >= 70) { + if (frame_size < 16) { + constexpr int tiled_size = 8; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } else { + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } + return; + } else { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + T><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + T><<exec_stream()>>>(value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template struct GRUUnitFunctor; + +template <> +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + half><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + half><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +void GRUCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void GRUCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + auto* input = param.input; + lite::Tensor* h0{nullptr}; + if (param.h0) { + h0 = const_cast(param.h0); + } + lite::Tensor* bias{nullptr}; + if (param.bias) { + bias = const_cast(param.bias); + } + const lite::Tensor* 
weight = param.weight; + T* weight_data = const_cast(weight->template data()); + lite::Tensor* batch_gate = param.batch_gate; + lite::Tensor* batch_reset_hidden_prev = param.batch_reset_hidden_prev; + lite::Tensor* batch_hidden = param.batch_hidden; + lite::Tensor* hidden = param.hidden; + T* batch_reset_hidden_prev_data = + batch_reset_hidden_prev->template mutable_data(TARGET(kCUDA)); + hidden->template mutable_data(TARGET(kCUDA)); + T* batch_gate_data = batch_gate->template mutable_data(TARGET(kCUDA)); + T* batch_hidden_data = batch_hidden->template mutable_data(TARGET(kCUDA)); + bool is_reverse = param.is_reverse; + auto active_node = lite::cuda::math::GetActiveType(param.activation); + auto active_gate = lite::cuda::math::GetActiveType(param.gate_activation); + bool origin_mode = param.origin_mode; + + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + lite::cuda::math::LoDTensor2BatchFunctor batch_func; + batch_func(*input, batch_gate, is_reverse, stream); + + if (bias) { + lite::cuda::math::RowwiseAdd add_bias; + add_bias(batch_gate_data, + bias->template data(), + batch_gate_data, + frame_size, + batch_gate->numel(), + stream); + } + GRUMetaValue gru_value; + gru_value.gate_weight = weight_data; + gru_value.state_weight = weight_data + 2 * frame_size * frame_size; + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ordered_h0_.Resize(h0->dims()); + lite::cuda::math::CopyMatrixRowsFunctor row_shuffle; + row_shuffle(*h0, &ordered_h0_, batch_gate->lod()[2], true, stream); + gru_value.prev_out_value = ordered_h0_.mutable_data(TARGET(kCUDA)); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (size_t n = 0; n < num_batch; ++n) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + gru_value.output_value = batch_hidden_data + bstart * frame_size; + gru_value.gate_value = batch_gate_data + bstart * frame_size * 3; + gru_value.reset_output_value = + batch_reset_hidden_prev_data + bstart * frame_size; + + GRUUnitFunctor::compute(gru_value, + frame_size, + cur_batch_size, + active_node, + active_gate, + origin_mode, + gemm_impl_.get(), + &context); + gru_value.prev_out_value = gru_value.output_value; + } + + lite::cuda::math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(*batch_hidden, hidden, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using GRUFp32 = + paddle::lite::kernels::cuda::GRUCompute; + +using GRUFp16 = paddle::lite::kernels::cuda::GRUCompute; + +REGISTER_LITE_KERNEL(gru, kCUDA, kFloat, kNCHW, GRUFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gru, kCUDA, kFP16, kNCHW, GRUFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), 
PRECISION(kFP16))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Weight", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchGate", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchResetHiddenPrev", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchHidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Hidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/gru_compute.h b/lite/kernels/cuda/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..070deca2c54b919d1afeb856633d94fe5919eabd --- /dev/null +++ b/lite/kernels/cuda/gru_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class GRUCompute : public KernelLite { + public: + using param_t = operators::GRUParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GRUCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; + lite::Tensor ordered_h0_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/gru_compute_test.cc b/lite/kernels/cuda/gru_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..adff5b6b28d6a2b4b9513148fa1219f78534dfca --- /dev/null +++ b/lite/kernels/cuda/gru_compute_test.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
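The GRU path above stages one time step as two GEMMs (prev_out times gate_weight accumulated into the update/reset slots of gate_value, then the reset-scaled hidden state times state_weight accumulated into the candidate slot) plus the GruForwardResetOutput and GruForwardFinalOutput kernels. A CPU sketch of a single step with that packed [update | reset | candidate] gate layout follows; it is illustrative only, and the final blend h = u * prev + (1 - u) * c is just one of the two conventions selected by origin_mode.

// CPU reference for one GRU step, batch size 1 (illustrative sketch).
#include <cmath>
#include <cstdio>
#include <vector>

// x_gates: precomputed x_t projection plus bias, laid out [update | reset | candidate],
// each of width `frame`. gate_w is [frame, 2*frame] (update/reset recurrences),
// state_w is [frame, frame] (candidate recurrence), mirroring
//   gate_weight = weight, state_weight = weight + 2 * frame * frame.
void gru_step(const std::vector<float>& x_gates,
              const std::vector<float>& prev,
              const std::vector<float>& gate_w,
              const std::vector<float>& state_w,
              std::vector<float>* out,
              int frame) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  std::vector<float> u(frame), r(frame), c(frame);
  // u_t, r_t = sigmoid(x_gates[:2*frame] + prev * gate_w)
  for (int j = 0; j < 2 * frame; ++j) {
    float acc = x_gates[j];
    for (int i = 0; i < frame; ++i) acc += prev[i] * gate_w[i * 2 * frame + j];
    (j < frame ? u[j] : r[j - frame]) = sigmoid(acc);
  }
  // c_t = tanh(x_gates[2*frame:] + (r_t * prev) * state_w)
  for (int j = 0; j < frame; ++j) {
    float acc = x_gates[2 * frame + j];
    for (int i = 0; i < frame; ++i) acc += r[i] * prev[i] * state_w[i * frame + j];
    c[j] = std::tanh(acc);
  }
  // h_t = u_t * prev + (1 - u_t) * c_t   (one of the two origin_mode conventions)
  for (int j = 0; j < frame; ++j) (*out)[j] = u[j] * prev[j] + (1.f - u[j]) * c[j];
}

int main() {
  const int frame = 4;
  std::vector<float> x_gates(3 * frame, 0.1f), prev(frame, 0.5f),
      gate_w(frame * 2 * frame, 0.01f), state_w(frame * frame, 0.01f),
      out(frame, 0.f);
  gru_step(x_gates, prev, gate_w, state_w, &out, frame);
  for (float v : out) printf("%f ", v);
  printf("\n");
  return 0;
}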
+ +#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class GRUTest : public ::testing::Test { + protected: + GRUTest() + : batch_(12), + frame_size_(128), + activation_("tanh"), + gate_activation_("sigmoid"), + is_reverse_(false), + origin_mode_(false), + x_shape_({batch_, frame_size_ * 3}), + w_shape_({frame_size_, frame_size_ * 3}), + out_shape_({batch_, frame_size_}), + lod_({{0, 4, 9, 12}}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(lod_); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + batch_gate_gpu_.Resize(lite::DDim(x_shape_)); + batch_hidden_gpu_.Resize(lite::DDim(out_shape_)); + batch_reset_hidden_gpu_.Resize(lite::DDim(out_shape_)); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.weight = &w_gpu_; + param_.gate_activation = gate_activation_; + param_.activation = activation_; + param_.is_reverse = is_reverse_; + param_.origin_mode = origin_mode_; + param_.hidden = &out_gpu_; + param_.batch_gate = &batch_gate_gpu_; + param_.batch_reset_hidden_prev = &batch_reset_hidden_gpu_; + param_.batch_hidden = &batch_hidden_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + } + + void RunBaseLine() {} + + int batch_, frame_size_; + std::string activation_, gate_activation_; + bool is_reverse_, origin_mode_; + std::vector x_shape_, w_shape_, out_shape_; + LoD lod_; + lite::Tensor x_ref_, w_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_; + lite::Tensor x_half_, w_half_; + lite::Tensor batch_gate_gpu_; + lite::Tensor batch_hidden_gpu_; + lite::Tensor batch_reset_hidden_gpu_; + lite::Tensor out_cpu_, out_gpu_; + + operators::GRUParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(GRUTest, TestFP32) { + InitFloatInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration 
= (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +TEST_F(GRUTest, TestFP16) { + InitHalfInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/lookup_table_compute_test.cc b/lite/kernels/cuda/lookup_table_compute_test.cc index 9323de14eb168fb55a68640350b87bf7040f5729..89050ea97f160b2fddb479966f59c05aafd8c268 100644 --- a/lite/kernels/cuda/lookup_table_compute_test.cc +++ b/lite/kernels/cuda/lookup_table_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/cuda/lookup_table_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/cuda/lookup_table_compute.h" namespace paddle { namespace lite { @@ -56,9 +58,7 @@ void LookupTableComputeRef(const operators::LookupTableParam& param) { } TEST(lookup_table_cuda, retrieve_op) { - auto lookup_table = - KernelRegistry::Global().Create( - "lookup_table"); + auto lookup_table = KernelRegistry::Global().Create("lookup_table"); ASSERT_FALSE(lookup_table.empty()); ASSERT_TRUE(lookup_table.front()); } diff --git a/lite/kernels/cuda/matmul_compute.cc b/lite/kernels/cuda/matmul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b80b673dfabdccc7c728fa3081a81a870531acf --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
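The GRU test above drives the kernel with LoD {{0, 4, 9, 12}}: level-0 offsets that delimit three sequences of lengths 4, 5 and 3 packed along the batch dimension, which is exactly what LoDTensor2BatchFunctor reorders before the per-step GEMMs. A tiny helper (not part of the patch, names are illustrative) makes that encoding explicit:

#include <cstdio>
#include <vector>

// Level-0 LoD offsets -> per-sequence lengths.
std::vector<int> lod_to_lengths(const std::vector<size_t>& offsets) {
  std::vector<int> lengths;
  for (size_t i = 0; i + 1 < offsets.size(); ++i)
    lengths.push_back(static_cast<int>(offsets[i + 1] - offsets[i]));
  return lengths;
}

int main() {
  std::vector<size_t> level0 = {0, 4, 9, 12};
  for (int len : lod_to_lengths(level0)) printf("%d ", len);  // prints: 4 5 3
  printf("\n");
  return 0;
}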
+ +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/matmul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void MatMulCompute::Run() { + auto& context = this->ctx_->template As(); + auto& param = this->template Param(); + + const auto* x_data = param.X->template data(); + const auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(TARGET(kCUDA)); + bool transpose_x = param.transpose_X; + bool transpose_y = param.transpose_Y; + float alpha = param.alpha; + + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + + int m = 0; + int k = 0; + int n = 0; + int batch = 0; + int64_t stride_x = 0; + int64_t stride_y = 0; + + if (x_dims.size() >= 2 && y_dims.size() >= 2 && + (x_dims.size() != 2 || y_dims.size() != 2)) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // or + // x: [M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [M, K], y: [B, K, N], out: [B, M, N] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = transpose_y ? y_dims[y_dims.size() - 2] : y_dims[y_dims.size() - 1]; + int batch_x = x_dims.size() == 2 ? 0 : x_dims.count(0, x_dims.size() - 2); + int batch_y = y_dims.size() == 2 ? 0 : y_dims.count(0, y_dims.size() - 2); + CHECK(batch_x == batch_y || batch_x == 0 || batch_y == 0) + << "batch_size x should be equal to batch_size y, or " + "one of batch_size x and batch_size y should be 0. " + "But got batch_size x = " + << batch_x << ", batch_size y = " << batch_y; + batch = batch_x == 0 ? batch_y : batch_x; + stride_x = x_dims.size() == 2 ? 0 : m * k; + stride_y = y_dims.size() == 2 ? 0 : k * n; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + m = transpose_x ? x_dims[1] : x_dims[0]; + k = transpose_x ? x_dims[0] : x_dims[1]; + n = transpose_y ? y_dims[0] : y_dims[1]; + gemm_impl_->init(transpose_x, transpose_y, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (x_dims.size() > 2 && y_dims.size() == 1) { + // x: [B, M, K], y: [K], out: [B, M] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? 
x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = 1; + batch = x_dims.count(0, x_dims.size() - 2); + stride_x = m * k; + stride_y = 0; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + if (!transpose_x && !transpose_y) { + // x: [K], y: [K], out: [1] + m = 1; + k = x_dims[0]; + n = 1; + CHECK_EQ(x_dims[0], y_dims[0]) + << "x_dims[0] should be equal to y_dims[0]"; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (transpose_x && transpose_y) { + // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N] + m = x_dims[0]; + k = 1; + n = y_dims[0]; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" + << y_dims << "), transpose_x(" << transpose_x + << "), transpose_y(" << transpose_y << ")"; + } + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims + << ")"; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using MatMulFp32 = + paddle::lite::kernels::cuda::MatMulCompute; + +using MatMulFp16 = + paddle::lite::kernels::cuda::MatMulCompute; + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFloat, kNCHW, MatMulFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFP16, kNCHW, MatMulFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/matmul_compute.h b/lite/kernels/cuda/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..69ad178d9184b7c3893f49a23024a14d7466115b --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
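MatMulCompute::Run above reduces every supported shape combination to either a plain GEMM or a strided batched GEMM, using a zero stride to broadcast a 2-D operand across the batch. The host-side sketch below re-derives (m, n, k, batch, stride_x, stride_y) under the same rules for the batched branch; the struct and helper names are assumptions made for illustration.

#include <cassert>
#include <cstdio>
#include <vector>

struct StridedGemmShape {
  int m, n, k, batch;
  long long stride_x, stride_y;
};

// Mirrors the "at least one side is batched" branch of the kernel above.
StridedGemmShape InferShape(const std::vector<long long>& x_dims,
                            const std::vector<long long>& y_dims,
                            bool trans_x, bool trans_y) {
  auto count = [](const std::vector<long long>& d, size_t b, size_t e) {
    long long p = 1;
    for (size_t i = b; i < e; ++i) p *= d[i];
    return p;
  };
  StridedGemmShape s{};
  size_t xr = x_dims.size(), yr = y_dims.size();
  s.m = static_cast<int>(trans_x ? x_dims[xr - 1] : x_dims[xr - 2]);
  s.k = static_cast<int>(trans_x ? x_dims[xr - 2] : x_dims[xr - 1]);
  s.n = static_cast<int>(trans_y ? y_dims[yr - 2] : y_dims[yr - 1]);
  long long batch_x = xr == 2 ? 0 : count(x_dims, 0, xr - 2);
  long long batch_y = yr == 2 ? 0 : count(y_dims, 0, yr - 2);
  assert(batch_x == batch_y || batch_x == 0 || batch_y == 0);
  s.batch = static_cast<int>(batch_x == 0 ? batch_y : batch_x);
  s.stride_x = xr == 2 ? 0 : 1LL * s.m * s.k;   // 0 stride => broadcast x
  s.stride_y = yr == 2 ? 0 : 1LL * s.k * s.n;   // 0 stride => broadcast y
  return s;
}

int main() {
  // x: [B, M, K] = [4, 8, 16], y: [K, N] = [16, 32]  ->  y is broadcast.
  auto s = InferShape({4, 8, 16}, {16, 32}, false, false);
  printf("m=%d n=%d k=%d batch=%d stride_x=%lld stride_y=%lld\n",
         s.m, s.n, s.k, s.batch, s.stride_x, s.stride_y);
  return 0;
}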
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/backends/cuda/math/strided_gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + strided_gemm_impl_.reset(new lite::cuda::math::StridedGemm); + gemm_impl_.reset(new lite::cuda::math::Gemm); + } + + void Run() override; + + virtual ~MatMulCompute() = default; + + private: + std::unique_ptr> strided_gemm_impl_{ + nullptr}; + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/matmul_compute_test.cc b/lite/kernels/cuda/matmul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..89f40af3920ba0d3e36781955ffbf5eaba093257 --- /dev/null +++ b/lite/kernels/cuda/matmul_compute_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/matmul_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatMulTest : public ::testing::Test { + protected: + MatMulTest() + : x_trans_(false), + y_trans_(true), + alpha_(1.0f), + x_shape_({4, 1, 2}), + y_shape_({4, 1, 2}), + out_shape_({4, 1, 1}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + y_ref_.Resize(lite::DDim(y_shape_)); + y_gpu_.Resize(y_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto y_ref_data = y_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(1); + } + for (int64_t i = 0; i < y_ref_.numel(); i++) { + y_ref_data[i] = static_cast(1); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &y_gpu_; + param_.transpose_X = x_trans_; + param_.transpose_Y = y_trans_; + param_.alpha = alpha_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + y_gpu_.Assign(y_ref_.data(), + y_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(x_ref_.dims()); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); ++i) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + y_half_.Resize(y_ref_.dims()); + auto y_half_data = y_half_.mutable_data(); + for 
(int64_t i = 0; i < y_half_.numel(); i++) { + y_half_data[i] = half(lite::float16(y_ref_.data()[i])); + } + y_gpu_.Assign(y_half_data, y_gpu_.dims()); + } + + void RunBaseLine() { + auto* out_data = out_ref_.mutable_data(); + for (int64_t i = 0; i < out_ref_.numel(); ++i) { + out_data[i] = 2; + } + } + + bool x_trans_, y_trans_; + float alpha_; + std::vector x_shape_, y_shape_, out_shape_; + lite::Tensor x_ref_, y_ref_, out_ref_; + lite::Tensor x_gpu_, y_gpu_; + lite::Tensor x_half_, y_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::MatMulParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(MatMulTest, TestFP32) { + InitFloatInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(MatMulTest, TestFP16) { + InitHalfInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/scale_compute.cc b/lite/kernels/cuda/scale_compute.cc index 6bf7414d8c85383a834159678cdd5a09e0b434d9..9ce5905a7de750e1eed41e56784419c737e6d2d9 100644 --- a/lite/kernels/cuda/scale_compute.cc +++ b/lite/kernels/cuda/scale_compute.cc @@ -23,8 +23,11 @@ namespace cuda { void ScaleCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); - float* output_data = param.output->mutable_data(); + float* output_data = param.output->mutable_data(TARGET(kCUDA)); DDim x_dims = param.x->dims(); bool bias_after_scale = param.bias_after_scale; float scale = param.scale; @@ -33,7 +36,7 @@ void ScaleCompute::Run() { bias *= scale; } lite::cuda::math::scale( - x_dims.production(), x_data, output_data, scale, bias); + x_dims.production(), x_data, output_data, scale, bias, stream); } } // namespace cuda diff --git 
a/lite/kernels/cuda/sequence_mask_compute.cu b/lite/kernels/cuda/sequence_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8e227a6a272127f500e10775f7ed4db53660e1f8 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.cu @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void SequenceMaskKernel(T* dst, + const int64_t* src, + int count, + int maxlen) { + CUDA_KERNEL_LOOP(index, count) { + int src_idx = index / maxlen; + int inner_idx = index % maxlen; + dst[index] = static_cast(inner_idx < src[src_idx] ? 1 : 0); + } +} + +template +void SequenceMaskCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const int64_t* x_data = x->template data(); + auto* y = param.Y; + int maxlen = param.maxlen; + + if (param.MaxLenTensor) { + auto* len_tensor_data = param.MaxLenTensor->template data(); + int32_t len_data{0}; + TargetWrapperCuda::MemcpySync( + &len_data, len_tensor_data, sizeof(int32_t), IoDirection::DtoH); + maxlen = len_data; + } + + if (maxlen < 0) { + maxlen = static_cast( + thrust::reduce(thrust::device_pointer_cast(x_data), + thrust::device_pointer_cast(x_data) + x->numel(), + static_cast(0), + thrust::maximum())); + } + + auto y_dim = x->dims().Vectorize(); + y_dim.push_back(maxlen); + y->Resize(y_dim); + const int count = y->numel(); + auto* dst_data = y->template mutable_data(TARGET(kCUDA)); + if (param.out_dtype == 5) { + SequenceMaskKernel< + T><<>>( + dst_data, x_data, count, maxlen); + } else { + LOG(FATAL) << "not supported out_dtype: " << param.out_dtype; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqMaskFp32 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +using SeqMaskFp16 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFloat, kNCHW, SeqMaskFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFP16, kNCHW, SeqMaskFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_mask_compute.h b/lite/kernels/cuda/sequence_mask_compute.h new file 
mode 100644 index 0000000000000000000000000000000000000000..3611587f0ce7daef1a88f5b6a916e2d30d33bcc1 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceMaskCompute : public KernelLite { + public: + using param_t = operators::SequenceMaskParam; + + void Run() override; + virtual ~SequenceMaskCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute_test.cc b/lite/kernels/cuda/sequence_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..efbdf2ae00b6d1d9353831e94a202e5e42228b62 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute_test.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
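The sequence_mask kernel above expands X (a vector of lengths) into Y of shape X.dims() + [maxlen] with Y[i][j] = (j < X[i]), resolving maxlen from MaxLenTensor or, when it is negative, from max(X) via thrust::reduce. A CPU reference with the same semantics, using the same lengths as the test that follows, is sketched here (illustrative only):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Y[i][j] = 1 if j < x[i] else 0; maxlen < 0 means "use max(x)".
std::vector<float> sequence_mask(const std::vector<int64_t>& x, int maxlen) {
  if (maxlen < 0)
    maxlen = static_cast<int>(*std::max_element(x.begin(), x.end()));
  std::vector<float> y(x.size() * maxlen);
  for (size_t i = 0; i < x.size(); ++i)
    for (int j = 0; j < maxlen; ++j)
      y[i * maxlen + j] = j < x[i] ? 1.f : 0.f;
  return y;
}

int main() {
  auto y = sequence_mask({3, 2, 1, 0}, 4);  // lengths used by the test below
  for (size_t i = 0; i < y.size(); ++i)
    printf("%.0f%s", y[i], (i + 1) % 4 == 0 ? "\n" : " ");
  return 0;
}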
+ +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceMaskTest : public ::testing::Test { + protected: + SequenceMaskTest() + : maxlen_(4), + out_dtype_(5), + x_data_({3, 2, 1, 0}), + out_shape_({static_cast(x_data_.size()), maxlen_}) { + x_ref_.Resize(lite::DDim({static_cast(x_data_.size())})); + x_gpu_.Resize(x_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + + // prepare input + for (size_t i = 0; i < x_data_.size(); i++) { + x_ref_data[i] = x_data_[i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &out_gpu_; + param_.maxlen = maxlen_; + param_.out_dtype = out_dtype_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + auto* out_data = out->mutable_data(); + + for (size_t i = 0; i < x_data_.size(); ++i) { + for (int j = 0; j < maxlen_; ++j) { + out_data[i * maxlen_ + j] = j < x_data_[i] ? 1 : 0; + } + } + } + + int maxlen_, out_dtype_; + std::vector x_data_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor out_cpu_; + + operators::SequenceMaskParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceMaskTest, fp32) { + InitFloatInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceMaskTest, TestFP16) { + InitHalfInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = 
static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute.cu b/lite/kernels/cuda/sequence_pad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8368eb3007e3f1d036420a5dc1c86204365e179c --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.cu @@ -0,0 +1,116 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequencePadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const auto* pad_value = param.PadValue; + auto* out = param.Out; + auto* len_t = param.Length; + int seq_num = x->lod()[0].size() - 1; + int padded_length; + if (param.padded_length == -1) { + int max_seq_len = 0; + for (int i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + } + padded_length = max_seq_len; + } else { + padded_length = param.padded_length; + } + + int max_seq_len = 0; + int step_width = x->numel() / x->dims()[0]; + + // calc for param.Lenght + seq_len_.resize(seq_num); + seq_offsets_vec_.resize(x->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + seq_len_[i] = x->lod()[0][i + 1] - x->lod()[0][i]; + seq_offsets_vec_[i] = x->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = x->lod()[0][seq_num]; + TargetWrapperCuda::MemcpyAsync( + len_t->template mutable_data(TARGET(kCUDA)), + seq_len_.data(), + sizeof(int64_t) * seq_len_.size(), + IoDirection::HtoD, + stream); + seq_offsets_.Resize({static_cast(x->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* seq_data = x->template data(); + T* pad_data = out->template mutable_data(TARGET(kCUDA)); + const T* pad_value_data = pad_value->template data(); + + lite::cuda::math::SequencePadding(pad_data, + seq_data, + pad_value_data, + pad_value->numel() == 1, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqPadFp32 = + paddle::lite::kernels::cuda::SequencePadCompute; + +using SeqPadFp16 = + paddle::lite::kernels::cuda::SequencePadCompute; + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFloat, kNCHW, 
SeqPadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("PadValue", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFP16, kNCHW, SeqPadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("PadValue", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pad_compute.h b/lite/kernels/cuda/sequence_pad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c494fe127d4eb5a7c0ba77a5c76ab1d1d0c1f2f2 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequencePadCompute : public KernelLite { + public: + using param_t = operators::SequencePadParam; + + void Run() override; + virtual ~SequencePadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_len_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute_test.cc b/lite/kernels/cuda/sequence_pad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..91141984c98d5d105f51d0acc247aa878ff219a7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute_test.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
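The sequence_pad kernel registered above takes a LoD input X and a PadValue tensor (a scalar or one value per feature) and writes a dense Out of shape [seq_num, padded_length, step_width] together with a Length tensor holding each original sequence length. As a mental model for what the CUDA path computes, here is a minimal host-side sketch; the LoD offsets, sizes, and the scalar pad value in it are illustrative assumptions, not values taken from the patch. The test below checks the same layout, but with a per-feature pad value instead of a scalar.

#include <cstdio>
#include <vector>

// Host reference for sequence_pad: every sequence in `x` (flattened as
// [total_steps, step_width]) is copied into a [seq_num, padded_length,
// step_width] buffer; trailing steps are filled with `pad_value`.
void SequencePadRef(const std::vector<float>& x,
                    const std::vector<size_t>& lod,   // e.g. {0, 2, 5}
                    int step_width,
                    int padded_length,
                    float pad_value,
                    std::vector<float>* out,
                    std::vector<int64_t>* length) {
  int seq_num = static_cast<int>(lod.size()) - 1;
  out->assign(seq_num * padded_length * step_width, pad_value);
  length->resize(seq_num);
  for (int i = 0; i < seq_num; ++i) {
    int seq_len = static_cast<int>(lod[i + 1] - lod[i]);
    (*length)[i] = seq_len;
    for (int s = 0; s < seq_len; ++s) {
      for (int w = 0; w < step_width; ++w) {
        (*out)[(i * padded_length + s) * step_width + w] =
            x[(lod[i] + s) * step_width + w];
      }
    }
  }
}

int main() {
  std::vector<float> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};  // 5 steps x 2 features
  std::vector<float> out;
  std::vector<int64_t> length;
  SequencePadRef(x, {0, 2, 5}, /*step_width=*/2, /*padded_length=*/3,
                 /*pad_value=*/0.f, &out, &length);
  std::printf("lengths: %lld %lld\n",
              static_cast<long long>(length[0]),
              static_cast<long long>(length[1]));
  return 0;
}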
+ +#include "lite/kernels/cuda/sequence_pad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePadTest : public ::testing::Test { + protected: + SequencePadTest() + : batch_(5), + features_(2), + padded_length_(3), + x_lod_({{0, 2, 5}}), + x_shape_({batch_, features_}), + pad_value_shape_({features_}), + out_shape_({static_cast(x_lod_[0].size() - 1), + padded_length_, + features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(x_lod_); + x_gpu_.Resize(x_ref_.dims()); + + pad_value_ref_.Resize(lite::DDim(pad_value_shape_)); + pad_value_gpu_.Resize(pad_value_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(x_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + length_cpu_.Resize(length_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto pad_value_ref_data = pad_value_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (int64_t i = 0; i < pad_value_ref_.numel(); i++) { + pad_value_ref_data[i] = static_cast(i); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.PadValue = &pad_value_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + param_.padded_length = padded_length_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_gpu_.Assign( + pad_value_ref_.data(), pad_value_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_half_.Resize(pad_value_ref_.dims()); + auto pad_value_half_data = pad_value_half_.mutable_data(); + for (int64_t i = 0; i < pad_value_half_.numel(); i++) { + pad_value_half_data[i] = + half(lite::float16(pad_value_ref_.data()[i])); + } + pad_value_gpu_.Assign( + pad_value_half_data, pad_value_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* pad_value, + lite::Tensor* out, + lite::Tensor* length) { + auto* length_data = length->mutable_data(); + auto* out_data = out->mutable_data(); + length_data[0] = 2; + length_data[1] = 3; + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + out_data[4] = 0; + out_data[5] = 1; + for (size_t i = 4; i < 10; ++i) { + out_data[2 + i] = i; + } + } + + int batch_, features_, padded_length_; + LoD x_lod_; + std::vector x_shape_, pad_value_shape_, out_shape_; + + lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_, pad_value_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequencePadTest, fp32) { + InitFloatInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); 
+ kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_.mutable_data(), + length_gpu_.data(), + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequencePadTest, TestFP16) { + InitHalfInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + const int64_t* length_gpu_data = length_gpu_.data(); + int64_t* length_cpu_data = length_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_data, + length_gpu_data, + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute.cu b/lite/kernels/cuda/sequence_unpad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4274e19a86d55a4e5e5099e984c537c2929bce7 --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.cu @@ -0,0 +1,124 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
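The FP16 variants of these tests all follow the same pattern: the FP32 reference input is converted element-wise to half on the host, the kFP16 kernel runs on the GPU, and the result is widened back to float and compared against the FP32 baseline with a relative tolerance (1e-2 or 2e-2) instead of the absolute 1e-5 used for FP32. A small stand-alone sketch of that comparison, with illustrative helper names not present in the patch:

#include <cmath>
#include <cstddef>

// Relative-error check in the spirit of the FP16 tests: the half result is
// widened back to float and compared to the fp32 reference. The +eps in the
// denominator keeps zero references (e.g. padded positions) from dividing
// by zero.
inline bool AlmostEqualRel(float res, float ref,
                           float rel_tol = 1e-2f, float eps = 1e-5f) {
  return std::fabs(res - ref) / (std::fabs(ref) + eps) <= rel_tol;
}

inline int CountMismatches(const float* res, const float* ref, size_t n,
                           float rel_tol = 1e-2f) {
  int bad = 0;
  for (size_t i = 0; i < n; ++i) {
    if (!AlmostEqualRel(res[i], ref[i], rel_tol)) ++bad;
  }
  return bad;
}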
+ +#include + +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequenceUnpadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_dims = param.X->dims(); + auto len_dims = param.Length->dims(); + + auto* seq_len_ptr = param.Length->template data(); + seq_len_cpu_.Resize(param.Length->dims()); + TargetWrapperCuda::MemcpyAsync(seq_len_cpu_.mutable_data(), + seq_len_ptr, + sizeof(int64_t) * param.Length->numel(), + IoDirection::DtoH, + stream); + TargetWrapperCuda::StreamSync(stream); + + int64_t batch_size = len_dims[0]; + std::vector out_lod0(batch_size + 1, 0); + for (int64_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_cpu_.data()[i]; + } + paddle::lite::LoD out_lod; + out_lod.push_back(out_lod0); + + int64_t out_dim0 = out_lod0.back(); + std::vector out_dims{out_dim0}; + if (x_dims.size() == 2) { + out_dims.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims.push_back(x_dims[i]); + } + } + param.Out->Resize(out_dims); + param.Out->set_lod(out_lod); + + const auto* pad_tensor = param.X; + auto* seq_tensor = param.Out; + + int padded_length = pad_tensor->dims()[1]; + int seq_num = seq_tensor->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = seq_tensor->numel() / seq_tensor->dims()[0]; + + seq_offsets_vec_.resize(seq_tensor->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, + static_cast(seq_tensor->lod()[0][i + 1] - + seq_tensor->lod()[0][i])); + seq_offsets_vec_[i] = seq_tensor->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = seq_tensor->lod()[0][seq_num]; + seq_offsets_.Resize({static_cast(seq_tensor->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* pad_data = pad_tensor->template data(); + T* seq_data = seq_tensor->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::SequenceUnpadding(seq_data, + pad_data, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqUnadFp32 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +using SeqUnadFp16 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFloat, kNCHW, SeqUnadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFP16, kNCHW, SeqUnadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_unpad_compute.h b/lite/kernels/cuda/sequence_unpad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6b077a4dcbd91eb8f9a9e2cb1340088434f117aa --- /dev/null +++ 
b/lite/kernels/cuda/sequence_unpad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <vector> +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template <typename T, PrecisionType Ptype> +class SequenceUnpadCompute : public KernelLite<TARGET(kCUDA), Ptype> { + public: + using param_t = operators::SequenceUnpadParam; + + void Run() override; + virtual ~SequenceUnpadCompute() = default; + + private: + lite::Tensor seq_offsets_; + lite::Tensor seq_len_cpu_; + std::vector<size_t> seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute_test.cc b/lite/kernels/cuda/sequence_unpad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..417115a50b6d086bd628a0b93a7d45c688ea18af --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
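The sequence_unpad kernel above is the inverse operation: given the dense [seq_num, padded_length, step_width] tensor and the per-sequence Length tensor, it copies back only the valid steps and rebuilds the output LoD from the lengths on the host before launching the device copy. A host-side reference sketch under the same assumptions, with plain vectors in place of lite::Tensor:

#include <vector>

// Host reference for sequence_unpad: copies the first `length[i]` steps of
// each padded row back into a flat [sum(length), step_width] buffer and
// rebuilds the LoD offsets from the lengths.
void SequenceUnpadRef(const std::vector<float>& pad,  // [seq_num, padded_len, w]
                      const std::vector<int64_t>& length,
                      int padded_length,
                      int step_width,
                      std::vector<float>* seq,
                      std::vector<size_t>* lod) {
  int seq_num = static_cast<int>(length.size());
  lod->assign(1, 0);  // offsets start at 0
  for (int i = 0; i < seq_num; ++i) lod->push_back(lod->back() + length[i]);
  seq->resize(lod->back() * step_width);
  for (int i = 0; i < seq_num; ++i) {
    for (int64_t s = 0; s < length[i]; ++s) {
      for (int w = 0; w < step_width; ++w) {
        (*seq)[((*lod)[i] + s) * step_width + w] =
            pad[(i * padded_length + s) * step_width + w];
      }
    }
  }
}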
+ +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceUnpadTest : public ::testing::Test { + protected: + SequenceUnpadTest() + : batch_(5), + features_(2), + padded_length_(3), + out_lod_({{0, 2, 5}}), + x_shape_({static_cast(out_lod_[0].size() - 1), + padded_length_, + features_}), + out_shape_({batch_, features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(out_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + auto* length_ref_data = length_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) { + length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_ref_.set_lod(out_lod_); + out_gpu_.Resize(out_ref_.dims()); + out_gpu_.set_lod(out_ref_.lod()); + out_cpu_.Resize(out_ref_.dims()); + out_cpu_.set_lod(out_ref_.lod()); + + RunBaseLine(&x_ref_, &length_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* X, + const lite::Tensor* Length, + lite::Tensor* Out) { + auto* out_data = Out->mutable_data(); + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + for (size_t i = 6; i < 12; ++i) { + out_data[i - 2] = i; + } + } + + int batch_, features_, padded_length_; + LoD out_lod_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceUnpadTest, fp32) { + InitFloatInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], 
out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceUnpadTest, TestFP16) { + InitHalfInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sigmoid_compute.cu b/lite/kernels/cuda/sigmoid_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..2879f50b4d8a61c80c8c73bf8b3f43e4c8dbe5b0 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
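All of the new CUDA tests share one timing scaffold: an untimed warm-up loop of Launch() calls, then PrepareForRun() and FLAGS_repeats Run() calls timed together, with the average reported in milliseconds; the tests also call cudaDeviceSynchronize() before reading the elapsed time because the kernels are asynchronous. Stripped of kernel specifics, the pattern is roughly the sketch below; the helper name and the use of std::chrono instead of GetCurrentUS are illustrative, and the device synchronization is omitted for brevity.

#include <chrono>

// Generic benchmark skeleton mirroring the tests: warm up, then time
// PrepareForRun plus `repeats` Run calls and report the per-run average.
template <typename Kernel>
double BenchmarkMs(Kernel* kernel, int warmup, int repeats) {
  for (int i = 0; i < warmup; ++i) kernel->Launch();  // untimed warm-up
  auto start = std::chrono::steady_clock::now();
  kernel->PrepareForRun();                 // timed together with the runs,
  for (int i = 0; i < repeats; ++i) {      // as in the tests above
    kernel->Run();
  }
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count() /
         repeats;
}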
+ +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sigmoid_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SigmoidCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + int num = static_cast(param.X->numel()); + auto input = param.X->template data(); + auto output = param.Out->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::sigmoid(num, input, output, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SigmoidFp32 = + paddle::lite::kernels::cuda::SigmoidCompute; + +using SigmoidFp16 = + paddle::lite::kernels::cuda::SigmoidCompute; + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFloat, kNCHW, SigmoidFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFP16, kNCHW, SigmoidFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sigmoid_compute.h b/lite/kernels/cuda/sigmoid_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..455dc38d1f8d04fdaf5f4a70ee704c8a2fe7ddef --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SigmoidCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + virtual ~SigmoidCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sigmoid_compute_test.cc b/lite/kernels/cuda/sigmoid_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27904333b918baf0de7042005955b8fb44d6930 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute_test.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sigmoid_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SigmoidTest : public ::testing::Test { + protected: + SigmoidTest() : m_(8), n_(64), shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(shape_)); + x_gpu_.Resize(lite::DDim(shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } + + void RunBaseLine() { + for (int64_t i = 0; i < x_ref_.numel(); ++i) { + out_ref_.mutable_data()[i] = + 1.f / (1.f + expf(-1 * x_ref_.data()[i])); + } + } + + int m_, n_; + std::vector shape_; + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::ActivationParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SigmoidTest, TestFP32) { + InitFloatInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(SigmoidTest, TestFP16) { + InitHalfInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = 
out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute.cu b/lite/kernels/cuda/topk_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb4499b637a1435dec2dc913bf8141edd60130fc --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.cu @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void top_k_pooling_batch_kernel_reduction(Dtype *output_data, + const Dtype *input, + const int *height_offset, + const int *width_offset, + const int batch_size, + const int channel_num, + const int height_stride, + const int width_stride, + const int k) { + const Dtype *input_start = + input + + (blockIdx.x * channel_num + blockIdx.y) * height_stride * width_stride; + Dtype *output_start = + output_data + (blockIdx.x * channel_num + blockIdx.y) * k; + + int width = width_offset[blockIdx.x + 1] - width_offset[blockIdx.x]; + int height = height_offset[blockIdx.x + 1] - height_offset[blockIdx.x]; + int real_k = k < height * width ? 
k : height * width; + + extern __shared__ Dtype smem[]; + + Dtype min_val = -100000.0f; + for (int j = threadIdx.x; j < height * width; j += blockDim.x) { + int index_tmp = (j / width) * width_stride + j % width; + smem[j] = input_start[index_tmp]; + } + __syncthreads(); + + // get max val + int t = 0; + for (; t < real_k; ++t) { + // reduction + for (int gap = height * width; gap > 1;) { + if (threadIdx.x == 0) { // edge cond + if (gap % 2 != 0) { + Dtype value_first = smem[0]; + Dtype value_gap = smem[gap - 1]; + if (value_first < value_gap) { + smem[0] = value_gap; + smem[gap - 1] = value_first; + } + } + } + gap >>= 1; + for (int j = threadIdx.x; j < gap; j += blockDim.x) { + Dtype value_first = smem[j]; + Dtype value_gap = smem[j + gap]; + if (value_first < value_gap) { + smem[j] = value_gap; + smem[j + gap] = value_first; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + output_start[t] = smem[0]; + smem[0] = min_val; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < (k - t); i += blockDim.x) { + // output_start[t + i] = 0.0f; + } +} + +template +void TopkPoolingCompute::PrepareForRun() { + int device_id = lite::TargetWrapperCuda::GetCurDevice(); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device_id)); + _shared_mem_size = deviceProp.sharedMemPerBlock; +} + +template +void TopkPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0) + << "X sequence offset is not valid"; + CHECK(param.Y->lod().size() > 0 && param.Y->lod()[0].size() > 0) + << "Y sequence offset is not valid"; + + int width_offset_len = param.X->lod()[0].size(); + lite::DDim width_offset_shape(std::vector{width_offset_len}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.X->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.X->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _width_offset.mutable_data(TARGET(kCUDA)), + width_lod_0.data(), + sizeof(int) * width_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + int height_offset_len = param.Y->lod()[0].size(); + lite::DDim height_offset_shape(std::vector{height_offset_len}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.Y->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.Y->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _height_offset.mutable_data(TARGET(kCUDA)), + height_lod_0.data(), + sizeof(int) * height_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + + int num = x_tensor->dims()[0]; + int channel = x_tensor->dims()[1]; + int height = x_tensor->dims()[2]; + int width = x_tensor->dims()[3]; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = height * width; + + if (feat_map_size * sizeof(T) <= _shared_mem_size) { + dim3 blocks(num, channel); + dim3 threads(32, 1); + + top_k_pooling_batch_kernel_reduction< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + num, + channel, + height, + width, + param.top_k); + } else { + LOG(FATAL) << "Not implemented. 
Exceeded the shared memory limit."; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(topk_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::TopkPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/topk_pooling_compute.h b/lite/kernels/cuda/topk_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..abf16163812a74de8ebb8cce0dd7d80469e0a7d8 --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class TopkPoolingCompute + : public KernelLite { + public: + using param_t = operators::TopkPoolingParam; + + void Run() override; + + void PrepareForRun() override; + + virtual ~TopkPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + int _shared_mem_size; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute_test.cc b/lite/kernels/cuda/topk_pooling_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fb5c29f25bba0b4cc00f3eb58fc1c0726e6b23b --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
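topk_pooling keeps, for each (batch, channel) pair, the k largest activations inside the valid height-by-width window given by the Y and X LoD offsets. The CUDA kernel above stages that window in shared memory and repeats a parallel max-reduction k times, which is why it aborts when a feature map does not fit in sharedMemPerBlock. A simple host reference of the same selection is sketched below; note that it zero-fills positions past the window size only to stay self-contained, whereas the device kernel currently leaves that tail untouched (its zero fill is commented out).

#include <algorithm>
#include <functional>
#include <vector>

// Host reference for top-k pooling over one (batch, channel) slice: gather
// the valid height x width window, partially sort it in descending order,
// and keep the k largest values.
std::vector<float> TopkPoolRef(const float* slice,  // [height_stride * width_stride]
                               int valid_h, int valid_w,
                               int width_stride, int k) {
  std::vector<float> vals;
  vals.reserve(valid_h * valid_w);
  for (int h = 0; h < valid_h; ++h) {
    for (int w = 0; w < valid_w; ++w) {
      vals.push_back(slice[h * width_stride + w]);
    }
  }
  int real_k = std::min(k, static_cast<int>(vals.size()));
  std::partial_sort(vals.begin(), vals.begin() + real_k, vals.end(),
                    std::greater<float>());
  vals.resize(k, 0.f);  // pad with zeros when fewer than k elements exist
  return vals;
}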
+ +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class TopkPooingTest : public ::testing::Test { + protected: + TopkPooingTest() + : num(2), + channels(4), + height(4), + width(4), + top_k(2), + feat_map_num(height * width), + x_lod({{0, 4, 7}}), + y_lod({{0, 4, 7}}), + x_shape({num, channels, height, width}), + out_shape({num, channels * top_k}) { + CHECK_EQ(x_lod[0].size(), num + 1) << "invalid input."; + for (size_t i = 1; i < x_lod[0].size(); ++i) { + CHECK_LE(x_lod[0][i] - x_lod[0][i - 1], height) << "invalid input."; + } + + X_gpu.Resize(lite::DDim(x_shape)); + X_ref.Resize(lite::DDim(x_shape)); + X_ref.set_lod(x_lod); + Y_gpu.Resize(lite::DDim(x_shape)); + Y_ref.Resize(lite::DDim(x_shape)); + Y_ref.set_lod(y_lod); + auto x_ref_data = X_ref.mutable_data(); + auto y_ref_data = Y_ref.mutable_data(); + + // prepare input + for (int64_t i = 0; i < X_ref.numel(); i++) { + x_ref_data[i] = static_cast(i % 16); + } + for (int64_t i = 0; i < Y_ref.numel(); i++) { + y_ref_data[i] = static_cast(i % 16); + } + + Out_ref.Resize(lite::DDim(out_shape)); + Out_gpu.Resize(lite::DDim(out_shape)); + Out_cpu.Resize(lite::DDim(out_shape)); + + device_init(); + } + + void device_init() { + ctx.reset(new KernelContext); + cudaStreamCreate(&stream); + param.X = &X_gpu; + param.Y = &Y_gpu; + param.Out = &Out_gpu; + param.top_k = top_k; + param.feat_map_num = feat_map_num; + } + + void float_data_init() { + X_gpu.Assign(X_ref.data(), + X_gpu.dims()); + X_gpu.set_lod(X_ref.lod()); + Y_gpu.Assign(Y_ref.data(), + Y_gpu.dims()); + Y_gpu.set_lod(Y_ref.lod()); + } + + void half_data_init() {} + + void cpu_base(const lite::Tensor* X, + const lite::Tensor* Y, + lite::Tensor* Out) {} + + int num, channels, height, width; + int top_k, feat_map_num; + std::vector> x_lod, y_lod; + std::vector x_shape, out_shape; + lite::Tensor X_ref, Y_ref, Out_ref; + lite::Tensor X_gpu, Y_gpu; + lite::Tensor Out_cpu, Out_gpu; + + operators::TopkPoolingParam param; + std::unique_ptr ctx; + cudaStream_t stream; +}; + +TEST_F(TopkPooingTest, fp32) { + float_data_init(); + auto& context = ctx->As(); + context.SetExecStream(stream); + TopkPoolingCompute kernel; + kernel.SetParam(param); + kernel.SetContext(std::move(ctx)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(Out_cpu.mutable_data(), + Out_gpu.data(), + sizeof(float) * Out_gpu.numel(), + IoDirection::DtoH); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/transpose_compute.cu b/lite/kernels/cuda/transpose_compute.cu index c5693c674c573d7c9f59034dd3c0985c9d94a22f..ec7ecd16e0daa9f9cb696224ae498825fe75c5b4 100644 --- a/lite/kernels/cuda/transpose_compute.cu +++ b/lite/kernels/cuda/transpose_compute.cu @@ -13,17 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include "lite/kernels/cuda/transpose_compute.h" + #include + #include "lite/core/op_registry.h" -#include "lite/kernels/cuda/transpose_compute.h" namespace paddle { namespace lite { namespace kernels { namespace cuda { -void TransposeCompute::Run() { - auto& param = this->Param(); +template +void TransposeCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); @@ -31,8 +34,8 @@ void TransposeCompute::Run() { lite::Tensor* Out = param.output; std::vector axes = param.axis; - const float* in = X->data(); - float* out = Out->mutable_data(TARGET(kCUDA)); + const T* in = X->template data(); + T* out = Out->mutable_data(TARGET(kCUDA)); int ndim = X->dims().size(); std::vector dims = X->dims().data(); @@ -40,7 +43,7 @@ void TransposeCompute::Run() { // NCHW -> NHWC if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 && axes[3] == 1) { - trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); + trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; @@ -49,13 +52,13 @@ void TransposeCompute::Run() { // NHWC -> NCHW if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 && axes[3] == 2) { - trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); + trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; } - trans.transpose(out, in, dims, axes, &stream); + trans_.transpose(out, in, dims, axes, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } @@ -65,34 +68,31 @@ void TransposeCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(transpose, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +using TransFp32 = + paddle::lite::kernels::cuda::TransposeCompute; + +using TransFp16 = + paddle::lite::kernels::cuda::TransposeCompute; + +REGISTER_LITE_KERNEL(transpose, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -REGISTER_LITE_KERNEL(transpose2, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -// REGISTER_LITE_KERNEL(transpose2, -// kCUDA, -// kFloat, -// kNCHW, -// paddle::lite::kernels::cuda::TransposeCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .Finalize(); +REGISTER_LITE_KERNEL(transpose, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), 
PRECISION(kFP16))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/transpose_compute.h b/lite/kernels/cuda/transpose_compute.h index 273d072231fb0608deb9ed729bdf153395ee983f..7e373c3b26c1701cd467148a06466a86f04e0c95 100644 --- a/lite/kernels/cuda/transpose_compute.h +++ b/lite/kernels/cuda/transpose_compute.h @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace cuda { -class TransposeCompute : public KernelLite { +template +class TransposeCompute : public KernelLite { public: using param_t = operators::TransposeParam; @@ -29,7 +30,7 @@ class TransposeCompute : public KernelLite { virtual ~TransposeCompute() = default; private: - lite::cuda::math::Transpose trans; + lite::cuda::math::Transpose trans_; }; } // namespace cuda diff --git a/lite/kernels/cuda/transpose_compute_test.cc b/lite/kernels/cuda/transpose_compute_test.cc index bf0d803a14a5f0e47c96128b953ae72a18798205..89654dd9c8a200f5672f23bd08c32b40b9b6f99e 100644 --- a/lite/kernels/cuda/transpose_compute_test.cc +++ b/lite/kernels/cuda/transpose_compute_test.cc @@ -13,11 +13,16 @@ // limitations under the License. #include "lite/kernels/cuda/transpose_compute.h" + #include #include #include #include +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + namespace paddle { namespace lite { namespace kernels { @@ -31,9 +36,9 @@ namespace { #define OUT(n, c, h, w) \ output_data[w + h * output_w + c * output_h * output_w + \ n * output_c * output_h * output_w] -void nchw2nhwc_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nchw2nhwcBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -64,9 +69,9 @@ void nchw2nhwc_ref(lite::Tensor* input, #define OUT(n, h, w, c) \ output_data[c + w * output_c + h * output_w * output_c + \ n * output_h * output_w * output_c] -void nhwc2nchw_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nhwc2nchwBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector& axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -89,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input, } } -void transpose_ref(lite::Tensor* input, +void TransBaseLine(const lite::Tensor* input, lite::Tensor* output, const std::vector axes) { auto* input_data = input->data(); @@ -123,7 +128,7 @@ void transpose_ref(lite::Tensor* input, } // namespace TEST(transpose_nchw, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -168,16 +173,15 @@ TEST(transpose_nchw, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nchw2nhwc_ref(&x_ref, &out_ref, axes); + Nchw2nhwcBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); - // transpose_ref(&x_ref, &out_ref, axes); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } TEST(transpose_nhwc, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -220,62 +224,146 @@ TEST(transpose_nhwc, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, 
out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nhwc2nchw_ref(&x_ref, &out_ref, axes); - // transpose_ref(&x_ref, &out_ref, axes); + Nhwc2nchwBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } -TEST(transpose, normal) { - TransposeCompute transpose_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); +class TransposeTest : public ::testing::Test { + protected: + TransposeTest() + : C_(3), + H_(128), + W_(64), + axes_({1, 2, 0}), + x_shape_({C_, H_, W_}), + out_shape_({H_, W_, C_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + auto X_ref__data = x_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + X_ref__data[i] = static_cast(i); + } - operators::TransposeParam param; + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); - lite::Tensor x, x_cpu, x_ref; - lite::Tensor out, out_cpu, out_ref; + InitParamAndContext(); + } - int C = 3, H = 128, W = 128; - std::vector axes({2, 0, 1}); - x.Resize({C, H, W}); - out.Resize({W, C, H}); + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.x = &x_gpu_; + param_.output = &out_gpu_; + param_.axis = axes_; + } - x_cpu.Resize({C, H, W}); - out_cpu.Resize({W, C, H}); + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } - x_ref.Resize({C, H, W}); - out_ref.Resize({W, C, H}); + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_ref_.dims())); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } - auto* x_cpu_data = x_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - auto* x_ref_data = x_ref.mutable_data(); + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + TransBaseLine(x, out, axes_); + } - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 1; - x_ref_data[i] = i + 1; + int C_, H_, W_; + std::vector axes_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_; + + operators::TransposeParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(TransposeTest, fp32) { + InitFloatInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); } - x.Assign(x_cpu_data, x_cpu.dims()); - param.x = &x; - param.output = &out; - param.axis = axes; - transpose_kernel.SetParam(param); - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - transpose_kernel.SetContext(std::move(ctx)); - transpose_kernel.Launch(); + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } cudaDeviceSynchronize(); - auto* out_data = out.mutable_data(TARGET(kCUDA)); - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - transpose_ref(&x_ref, &out_ref, axes); - auto* out_ref_data = out_ref.mutable_data(); - for (int i = 0; i < out.numel(); i++) { - 
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(TransposeTest, TestFP16) { + InitHalfInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_cpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); } } diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu index b847069879357ea600fd62b8f70d6c50e3c8c35f..b14073e5e1bfe074d355265726562579895dde86 100644 --- a/lite/kernels/cuda/var_conv_2d_compute.cu +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -184,6 +184,8 @@ using VarConvFp16 = REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); @@ -191,6 +193,9 @@ REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFP16, kNCHW, VarConvFp16, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .Finalize(); diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 6b4b2875f39c479f3ddd387230dbdf8e3d24ce3c..23f5639a9ddbafa38cc575ac5ca068916956a075 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -185,15 +185,11 @@ void YoloBoxCompute::Run() { anchors_.Resize({static_cast(anchors.size())}); int* d_anchors = anchors_.mutable_data(TARGET(kCUDA)); - // TargetWrapperCuda::MemcpyAsync(d_anchors, - // anchors.data(), - // sizeof(int) * anchors.size(), - // IoDirection::HtoD, - // stream); - CopySync(d_anchors, - anchors.data(), - sizeof(int) * anchors.size(), - IoDirection::HtoD); + 
TargetWrapperCuda::MemcpyAsync(d_anchors, + anchors.data(), + sizeof(int) * anchors.size(), + IoDirection::HtoD, + stream); int threads = 512; int blocks = (n * box_num + threads - 1) / threads; diff --git a/lite/kernels/fpga/activation_compute_test.cc b/lite/kernels/fpga/activation_compute_test.cc index cef87afffca65ee82ca63e58191d3877f62824f2..99f702b84b3439814433e7c416151b43772dfb0e 100644 --- a/lite/kernels/fpga/activation_compute_test.cc +++ b/lite/kernels/fpga/activation_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/activation_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/activation_compute.h" namespace paddle { namespace lite { @@ -37,8 +39,7 @@ void activation_compute_ref(const operators::ActivationParam& param) { } TEST(activation_fpga, retrive_op) { - auto activation = - KernelRegistry::Global().Create("relu"); + auto activation = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(activation.empty()); ASSERT_TRUE(activation.front()); } diff --git a/lite/kernels/fpga/fc_compute_test.cc b/lite/kernels/fpga/fc_compute_test.cc index 6ef8c02ed06dd89876dcab8c14fe389039bda614..08daecda314c771d0597951162d043f34d6316c9 100644 --- a/lite/kernels/fpga/fc_compute_test.cc +++ b/lite/kernels/fpga/fc_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/fc_compute.h" #include + #include #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/fc_compute.h" namespace paddle { namespace lite { @@ -76,8 +78,7 @@ void FillData(T* a, } TEST(fc_fpga, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); + auto fc = KernelRegistry::Global().Create("fc"); ASSERT_FALSE(fc.empty()); ASSERT_TRUE(fc.front()); } diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100755 new mode 100644 index 9248289fe9353705e7a2d84831b9f3de5d8ee7d7..ff93f1a6e1c30d006065deb04576255c24baed25 --- a/lite/kernels/fpga/pooling_compute_test.cc +++ b/lite/kernels/fpga/pooling_compute_test.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/pooling_compute.h" #include + #include #include #include -#include "lite/core/op_registry.h" #include "lite/backends/fpga/KD/float16.hpp" +#include "lite/core/op_registry.h" +#include "lite/kernels/fpga/pooling_compute.h" namespace paddle { namespace lite { @@ -277,8 +278,7 @@ TEST(pool_fpga, compute) { } TEST(pool_fpga, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/fpga/softmax_compute_test.cc b/lite/kernels/fpga/softmax_compute_test.cc index f92139d0f49b3d149531f11cb422e44ded6e7e64..a6f456ba1f140d07ccfcea0d7746c1061586611e 100644 --- a/lite/kernels/fpga/softmax_compute_test.cc +++ b/lite/kernels/fpga/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/softmax_compute.h" #include + #include #include + #include "lite/backends/fpga/KD/float16.hpp" #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index a70345708cce678b52e288a1f3eaf4ee1a23f541..381b9304142537da028b35c688128d34465965aa 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -16,3 +16,13 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(print_compute_host Host extra SRCS print_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(while_compute_host Host extra SRCS while_compute.cc DEPS ${lite_kernel_deps} program) +add_kernel(conditional_block_compute_host Host extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} program) +add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps}) + +if(LITE_BUILD_EXTRA) + lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host) +endif() diff --git a/lite/kernels/host/activation_grad_compute.cc b/lite/kernels/host/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b837cfda4572fa106a1ba1d015ffd5163b08340 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/host/activation_grad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * 2.0 * x_data[i]; + } +} + +void ReluGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = x_data[i] > 0 ? out_grad_data[i] : 0.0; + } +} + +void TanhGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.Out); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto out_data = param.Out->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * + (static_cast(1.0) - out_data[i] * out_data[i]); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(square_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL(relu_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL(tanh_grad, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::SquareGradCompute, + def) + .BindInput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/host/activation_grad_compute.h b/lite/kernels/host/activation_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d942b901c448ee87410a2030ea0f9f10aca0e493 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +class ReluGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~ReluGradCompute() = default; +}; + +class TanhGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~TanhGradCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/assign_compute.cc b/lite/kernels/host/assign_compute.cc index e496ffbd1d9a6362d730117be949cbdab83ec62a..bfbbc32e5f3b3b4dd5936e0e296306641312cabf 100644 --- a/lite/kernels/host/assign_compute.cc +++ b/lite/kernels/host/assign_compute.cc @@ -51,3 +51,19 @@ REGISTER_LITE_KERNEL( PRECISION(kAny), DATALAYOUT(kAny))}) .Finalize(); + +REGISTER_LITE_KERNEL(assign, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::AssignCompute, + def_tensor_array) + .BindInput("X", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/host/conditional_block_compute.cc similarity index 51% rename from lite/kernels/arm/conditional_block_compute.cc rename to lite/kernels/host/conditional_block_compute.cc index f0bd43e1300d4034241c03d3e4ce27dcaa59c1e5..5bdca012dd4e838f3371bae7cf17634513d59db5 100644 --- a/lite/kernels/arm/conditional_block_compute.cc +++ b/lite/kernels/host/conditional_block_compute.cc @@ -12,28 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/conditional_block_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/conditional_block_compute.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void ConditionalBlockCompute::PrepareForRun() { - auto& param = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto& param = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } + void ConditionalBlockCompute::Run() { - auto& param = Param(); + auto& param = this->Param(); for (auto& out : param.outs) { out->clear(); } @@ -43,32 +36,40 @@ void ConditionalBlockCompute::Run() { auto* cond_data = cond->data(); need_run = cond_data[0]; } else { - auto x = param.x; - for (auto pt : x) { - if (pt == nullptr || !pt->IsInitialized() || pt->dims().empty()) { + for (auto input : param.inputs) { + if (input == nullptr || !input->IsInitialized() || + input->dims().empty()) { need_run = false; break; } } } if (need_run) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL(conditional_block, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ConditionalBlockCompute, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::ConditionalBlockCompute, def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Cond", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Scope", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Input", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindInput("Cond", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) + .BindOutput("Out", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("Scope", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/conditional_block_compute.h b/lite/kernels/host/conditional_block_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8d3381ce3c4d6da076e6bb477df423bc640c56c9 --- /dev/null +++ b/lite/kernels/host/conditional_block_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class ConditionalBlockCompute + : public KernelLite { + public: + using param_t = operators::ConditionalBlockParam; + + void PrepareForRun() override; + void Run() override; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/print_compute.cc b/lite/kernels/host/print_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..00c8ab7b13597ad33b9fafc878cd553572462a99 --- /dev/null +++ b/lite/kernels/host/print_compute.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/print_compute.h" + +#include // NOLINT +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +const char kForward[] = "FORWARD"; +const char kBackward[] = "BACKWARD"; +const char kBoth[] = "BOTH"; + +class TensorFormatter { + public: + TensorFormatter() {} + + std::string Format(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + std::stringstream log_stream; + if (!tensor_name.empty()) { + log_stream << "Variable: " << tensor_name << std::endl; + } + + if (!message.empty()) { + log_stream << " - message: " << message << std::endl; + } + + if (print_tensor_lod_) { + log_stream << " - lod: {"; + const LoD& lod = print_tensor.lod(); + for (auto level : lod) { + log_stream << "{"; + bool is_first = true; + for (auto i : level) { + if (is_first) { + log_stream << i; + is_first = false; + } else { + log_stream << ", " << i; + } + } + log_stream << "}"; + } + log_stream << "}" << std::endl; + } + + log_stream << " - place: " << TargetToStr(print_tensor.target()) + << std::endl; // TODO(hong19860320) always kHost + + if (print_tensor_shape_) { + log_stream << " - shape: " << print_tensor.dims().repr() << std::endl; + } + + if (print_tensor_layout_) { + log_stream << " - layout: " + << DataLayoutToStr( + DATALAYOUT(kNCHW)) // TODO(hong19860320) Query the data + // layout from target tensor + << std::endl; + } + + auto dtype = print_tensor.precision(); + if (print_tensor_type_) { + log_stream << " - dtype: " << PrecisionToStr(dtype) << std::endl; + } + + if (dtype == PRECISION(kBool)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt8)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt16)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt32)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt64)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kFloat)) { + FormatData(print_tensor, log_stream); + } else { + 
log_stream << "\tdata: unprintable type: " << PrecisionToStr(dtype) + << std::endl; + } + return log_stream.str(); + } + + void Print(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + static std::mutex mutex; + std::lock_guard lock(mutex); + std::cout << Format(print_tensor, tensor_name, message); + } + + void SetPrintTensorType(bool print_tensor_type) { + print_tensor_type_ = print_tensor_type; + } + void SetPrintTensorShape(bool print_tensor_shape) { + print_tensor_shape_ = print_tensor_shape; + } + void SetPrintTensorLod(bool print_tensor_lod) { + print_tensor_lod_ = print_tensor_lod; + } + void SetPrintTensorLayout(bool print_tensor_layout) { + print_tensor_layout_ = print_tensor_layout; + } + void SetSummarize(int64_t summarize) { summarize_ = summarize; } + + private: + template + void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) { + int64_t print_size = summarize_ == -1 + ? print_tensor.numel() + : std::min(summarize_, print_tensor.numel()); + const T* data = print_tensor.data(); // Always kHost, so unnessary to + // copy the data from device + log_stream << " - data: ["; + if (print_size > 0) { + log_stream << data[0]; + for (int64_t i = 1; i < print_size; ++i) { + log_stream << " " << data[i]; + } + } + log_stream << "]" << std::endl; + } + + int64_t summarize_ = -1; + bool print_tensor_type_ = true; + bool print_tensor_shape_ = true; + bool print_tensor_lod_ = true; + bool print_tensor_layout_ = true; +}; + +void PrintCompute::Run() { + auto& param = Param(); + param.out->CopyDataFrom(*param.in); + + if ((param.is_forward && param.print_phase == kBackward) || + (!param.is_forward && param.print_phase == kForward)) { + return; + } + + int first_n = param.first_n; + if (first_n > 0 && ++times_ > first_n) return; + + TensorFormatter formatter; + const std::string& name = param.print_tensor_name ? param.name : ""; + formatter.SetPrintTensorType(param.print_tensor_type); + formatter.SetPrintTensorShape(param.print_tensor_shape); + formatter.SetPrintTensorLod(param.print_tensor_lod); + formatter.SetPrintTensorLayout(param.print_tensor_layout); + formatter.SetSummarize(static_cast(param.summarize)); + formatter.Print(*param.in, name, param.message); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + print, kHost, kAny, kAny, paddle::lite::kernels::host::PrintCompute, def) + .BindInput("In", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/host/print_compute.h b/lite/kernels/host/print_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91a54182d2d2e00250da01fcd5d62556da930198 --- /dev/null +++ b/lite/kernels/host/print_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class PrintCompute + : public KernelLite { + public: + using param_t = operators::PrintParam; + + void Run() override; + + virtual ~PrintCompute() = default; + + private: + mutable int times_{0}; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/retinanet_detection_output_compute.cc b/lite/kernels/host/retinanet_detection_output_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95a4bf708e7f03aee9d9ac99323b173287260b13 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.cc @@ -0,0 +1,435 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/retinanet_detection_output_compute.h" +#include +#include +#include +#include +#include "lite/operators/retinanet_detection_output_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +bool SortScoreTwoPairDescend(const std::pair>& pair1, + const std::pair>& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const std::vector& box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const std::vector& box1, + const std::vector& box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void NMSFast(const std::vector>& cls_dets, + const T nms_threshold, + const T eta, + std::vector* selected_indices) { + int64_t num_boxes = cls_dets.size(); + std::vector> sorted_indices; + for (int64_t i = 0; i < num_boxes; ++i) { + sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort( + sorted_indices.begin(), sorted_indices.end(), SortScorePairDescend); + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + + overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void DeltaScoreToPrediction( + const std::vector& bboxes_data, + const std::vector& anchors_data, + T im_height, + T im_width, + T im_scale, + int class_num, + const std::vector>& sorted_indices, + std::map>>* preds) { + im_height = static_cast(std::round(im_height / im_scale)); + im_width = static_cast(std::round(im_width / im_scale)); + T zero(0); + int i = 0; + for (const auto& it : sorted_indices) { + T score = it.first; + int idx = it.second; + int a = idx / class_num; + int c = idx % class_num; + + int box_offset = a * 4; + T anchor_box_width = + anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; + T anchor_box_height = + anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; + T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; + T anchor_box_center_y = + anchors_data[box_offset + 1] + anchor_box_height / 2; + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; + target_box_center_y = + bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; + target_box_width = std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; + target_box_height = + std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; + T pred_box_xmin = target_box_center_x - target_box_width / 2; + T pred_box_ymin = target_box_center_y - target_box_height / 2; + T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; + T pred_box_ymax = 
target_box_center_y + target_box_height / 2 - 1; + pred_box_xmin = pred_box_xmin / im_scale; + pred_box_ymin = pred_box_ymin / im_scale; + pred_box_xmax = pred_box_xmax / im_scale; + pred_box_ymax = pred_box_ymax / im_scale; + + pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); + pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); + pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); + pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); + + std::vector one_pred; + one_pred.push_back(pred_box_xmin); + one_pred.push_back(pred_box_ymin); + one_pred.push_back(pred_box_xmax); + one_pred.push_back(pred_box_ymax); + one_pred.push_back(score); + (*preds)[c].push_back(one_pred); + i++; + } +} + +template +void MultiClassNMS(const std::map>>& preds, + int class_num, + const int keep_top_k, + const T nms_threshold, + const T nms_eta, + std::vector>* nmsed_out, + int* num_nmsed_out) { + std::map> indices; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (static_cast(preds.count(c))) { + const std::vector> cls_dets = preds.at(c); + NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); + num_det += indices[c].size(); + } + } + + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(preds.at(label)[idx][4], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScoreTwoPairDescend); + if (num_det > keep_top_k) { + score_index_pairs.resize(keep_top_k); + } + + // Store the new indices. + std::map> new_indices; + for (const auto& it : score_index_pairs) { + int label = it.second.first; + int idx = it.second.second; + std::vector one_pred; + one_pred.push_back(label); + one_pred.push_back(preds.at(label)[idx][4]); + one_pred.push_back(preds.at(label)[idx][0]); + one_pred.push_back(preds.at(label)[idx][1]); + one_pred.push_back(preds.at(label)[idx][2]); + one_pred.push_back(preds.at(label)[idx][3]); + nmsed_out->push_back(one_pred); + } + + *num_nmsed_out = (num_det > keep_top_k ? 
keep_top_k : num_det); +} + +template +void RetinanetDetectionOutput( + const operators::RetinanetDetectionOutputParam& param, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, + const Tensor& im_info, + std::vector>* nmsed_out, + int* num_nmsed_out) { + int64_t nms_top_k = param.nms_top_k; + int64_t keep_top_k = param.keep_top_k; + T nms_threshold = static_cast(param.nms_threshold); + T nms_eta = static_cast(param.nms_eta); + T score_threshold = static_cast(param.score_threshold); + + int64_t class_num = scores[0].dims()[1]; + std::map>> preds; + for (size_t l = 0; l < scores.size(); ++l) { + // Fetch per level score + Tensor scores_per_level = scores[l]; + // Fetch per level bbox + Tensor bboxes_per_level = bboxes[l]; + // Fetch per level anchor + Tensor anchors_per_level = anchors[l]; + + int64_t scores_num = scores_per_level.numel(); + int64_t bboxes_num = bboxes_per_level.numel(); + std::vector scores_data(scores_num); + std::vector bboxes_data(bboxes_num); + std::vector anchors_data(bboxes_num); + std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); + std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); + std::copy_n(anchors_per_level.data(), bboxes_num, anchors_data.begin()); + std::vector> sorted_indices; + + // For the highest level, we take the threshold 0.0 + T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0); + GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); + auto* im_info_data = im_info.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + DeltaScoreToPrediction(bboxes_data, + anchors_data, + im_height, + im_width, + im_scale, + class_num, + sorted_indices, + &preds); + } + + MultiClassNMS(preds, + class_num, + keep_top_k, + nms_threshold, + nms_eta, + nmsed_out, + num_nmsed_out); +} + +template +void MultiClassOutput(const std::vector>& nmsed_out, + Tensor* outs) { + auto* odata = outs->mutable_data(); + int count = 0; + int64_t out_dim = 6; + for (size_t i = 0; i < nmsed_out.size(); ++i) { + odata[count * out_dim] = nmsed_out[i][0] + 1; // label + odata[count * out_dim + 1] = nmsed_out[i][1]; // score + odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin + odata[count * out_dim + 3] = nmsed_out[i][3]; // xmin + odata[count * out_dim + 4] = nmsed_out[i][4]; // xmin + odata[count * out_dim + 5] = nmsed_out[i][5]; // xmin + count++; + } +} + +void RetinanetDetectionOutputCompute::Run() { + auto& param = Param(); + auto& boxes = param.bboxes; + auto& scores = param.scores; + auto& anchors = param.anchors; + auto* im_info = param.im_info; + auto* outs = param.out; + + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + boxes_list[j] = *boxes[j]; + scores_list[j] = *scores[j]; + anchors_list[j] = *anchors[j]; + } + auto score_dims = scores_list[0].dims(); + int64_t batch_size = score_dims[0]; + auto box_dims = boxes_list[0].dims(); + int64_t box_dim = box_dims[2]; + int64_t out_dim = box_dim + 2; + + std::vector>> all_nmsed_out; + std::vector batch_starts = {0}; + for (int i = 0; i < batch_size; ++i) { + int num_nmsed_out = 0; + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + auto score_dims = scores_list[j].dims(); + score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); + 
score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); + box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); + box_per_batch_list[j].Resize({score_dims[1], box_dim}); + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + + std::vector> nmsed_out; + RetinanetDetectionOutput(param, + score_per_batch_list, + box_per_batch_list, + anchors_list, + im_info_slice, + &nmsed_out, + &num_nmsed_out); + all_nmsed_out.push_back(nmsed_out); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({0, out_dim}); + } else { + outs->Resize({static_cast(num_kept), out_dim}); + for (int i = 0; i < batch_size; ++i) { + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(all_nmsed_out[i], &out); + } + } + } + + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + retinanet_detection_output, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::RetinanetDetectionOutputCompute, + def) + .BindInput("BBoxes", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scores", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Anchors", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ImInfo", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/host/retinanet_detection_output_compute.h b/lite/kernels/host/retinanet_detection_output_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..612ea7105e2728b856f02d71e9fcfaea2a1ef680 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class RetinanetDetectionOutputCompute + : public KernelLite { + public: + void Run() override; + + virtual ~RetinanetDetectionOutputCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute.cc b/lite/kernels/host/where_index_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06be8d332734f3e41b0414e891c8810a117d8a6 --- /dev/null +++ b/lite/kernels/host/where_index_compute.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
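Aside (not part of the patch): the retinanet_detection_output kernel above is built on a standard greedy NMS over Jaccard (IoU) overlap. A compact standalone sketch of those two pieces, using the same {xmin, ymin, xmax, ymax, score} layout and the un-normalized "+1" area convention as the kernel (illustrative, not the Lite implementation itself):

#include <algorithm>
#include <vector>

using Box = std::vector<float>;  // {xmin, ymin, xmax, ymax, score}

// Intersection-over-union of two boxes (un-normalized pixel coordinates).
float IoU(const Box& a, const Box& b) {
  float ix1 = std::max(a[0], b[0]), iy1 = std::max(a[1], b[1]);
  float ix2 = std::min(a[2], b[2]), iy2 = std::min(a[3], b[3]);
  float iw = std::max(0.f, ix2 - ix1 + 1.f);
  float ih = std::max(0.f, iy2 - iy1 + 1.f);
  float inter = iw * ih;
  float area_a = (a[2] - a[0] + 1.f) * (a[3] - a[1] + 1.f);
  float area_b = (b[2] - b[0] + 1.f) * (b[3] - b[1] + 1.f);
  return inter / (area_a + area_b - inter);
}

// Greedy NMS: keep the highest-scoring box, drop boxes overlapping it
// above the threshold, repeat. Returns indices of the kept boxes.
std::vector<int> GreedyNMS(const std::vector<Box>& dets, float nms_threshold) {
  std::vector<int> order(dets.size());
  for (size_t i = 0; i < dets.size(); ++i) order[i] = static_cast<int>(i);
  std::stable_sort(order.begin(), order.end(), [&](int l, int r) {
    return dets[l][4] > dets[r][4];
  });
  std::vector<int> keep;
  for (int idx : order) {
    bool ok = true;
    for (int k : keep) {
      if (IoU(dets[idx], dets[k]) > nms_threshold) {
        ok = false;
        break;
      }
    }
    if (ok) keep.push_back(idx);
  }
  return keep;
}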
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +static void where_index_rank4(const int64_t* true_index, + int true_num, + const int64_t* stride, + int64_t* out) { + int cnt = true_num >> 1; + register int64_t stride0 = stride[0]; + register int64_t stride1 = stride[1]; + register int64_t stride2 = stride[2]; + register int64_t stride3 = stride[3]; + for (int i = 0; i < cnt; ++i) { + int64_t index0 = true_index[i * 2]; + int64_t index1 = true_index[i * 2 + 1]; + int out_index = i * 8; + // rank0 + register int64_t oindex0 = index0 / stride0; + register int64_t oindex1 = index1 / stride0; + out[out_index] = oindex0; + index0 -= oindex0 * stride0; + index1 -= oindex1 * stride0; + out[out_index + 4] = oindex1; + out_index++; + // rank1 + oindex0 = index0 / stride1; + oindex1 = index1 / stride1; + out[out_index] = oindex0; + index0 -= oindex0 * stride1; + index1 -= oindex1 * stride1; + out[out_index + 4] = oindex1; + out_index++; + // rank2 + oindex0 = index0 / stride2; + oindex1 = index1 / stride2; + out[out_index] = oindex0; + index0 -= oindex0 * stride2; + index1 -= oindex1 * stride2; + out[out_index + 4] = oindex1; + out_index++; + // rank3 + oindex0 = index0 / stride3; + oindex1 = index1 / stride3; + out[out_index] = oindex0; + out[out_index + 4] = oindex1; + } + // remain + for (int r = cnt * 2; r < true_num; ++r) { + int out_index = r * 4; + int64_t index = true_index[r]; + for (int i = 0; i < 4; ++i) { + out[out_index + i] = index / stride[i]; + index -= out[out_index + i] * stride[i]; + } + } +} + +inline void where_index_rank1(const int64_t* true_index, + int true_num, + int64_t* out) { + memcpy(out, true_index, true_num * sizeof(int64_t)); +} + +static void where_index_rankn(const int64_t* true_index, + int true_num, + const int64_t* stride, + int rank, + int64_t* out) { + int out_index = 0; + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int r = 0; r < rank; ++r) { + out[out_index] = index / stride[r]; + index -= out[out_index++] * stride[r]; + } + } +} + +template +void WhereIndexKernel(const operators::WhereIndexParam& param) { + auto* input = param.input; + auto* output = param.output; + auto dims = input->dims(); + auto numel = dims.production(); + int64_t rank = static_cast(dims.size()); + const T* cond_data = input->template data(); + int64_t true_num = 0; + std::vector true_index(numel); + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index[true_num] = i; + true_num++; + } + } + output->Resize({true_num, rank}); + if (true_num == 0) { + return; + } + auto* out_ptr = output->template mutable_data(); + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + if (rank == 1) { + 
where_index_rank1(true_index.data(), true_num, out_ptr); + } else if (rank == 4) { + where_index_rank4(true_index.data(), true_num, stride.data(), out_ptr); + } else { + where_index_rankn( + true_index.data(), true_num, stride.data(), rank, out_ptr); + } +} + +void WhereIndexCompute::Run() { + auto& param = this->Param(); + switch (param.input->precision()) { + case PRECISION(kFloat): + WhereIndexKernel(param); + break; + case PRECISION(kInt32): + WhereIndexKernel(param); + break; + case PRECISION(kInt64): + WhereIndexKernel(param); + break; + case PRECISION(kInt8): + WhereIndexKernel(param); + break; + case PRECISION(kBool): + WhereIndexKernel(param); + break; + default: + LOG(FATAL) << "WhereIndex does not implement for the " + << "input type:" << static_cast(param.input->precision()); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +using whereindex = paddle::lite::kernels::host::WhereIndexCompute; + +REGISTER_LITE_KERNEL(where_index, kHost, kAny, kAny, whereindex, def) + .BindInput("Condition", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/host/where_index_compute.h b/lite/kernels/host/where_index_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6936e3ed8f0ee16bf0e41095bbcbd0c18169d62f --- /dev/null +++ b/lite/kernels/host/where_index_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/where_index_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhereIndexCompute : public KernelLite { + public: + using param_t = operators::WhereIndexParam; + + void Run() override; + + virtual ~WhereIndexCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute_test.cc b/lite/kernels/host/where_index_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7097bdcae2bb319331af72c390a9d5de4fc23a9f --- /dev/null +++ b/lite/kernels/host/where_index_compute_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
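Aside (not part of the patch): the where_index kernel above converts the flat index of every true element into a multi-dimensional coordinate by repeatedly dividing by per-dimension strides. A standalone sketch of that decomposition in plain C++, using the same row-major stride construction as the kernel and its reference implementation in the test:

#include <cstdint>
#include <vector>

// Row-major strides: stride[last] = 1, stride[i] = stride[i+1] * dims[i+1].
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> stride(dims.size());
  stride.back() = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    stride[i] = stride[i + 1] * dims[i + 1];
  }
  return stride;
}

// Decompose a flat index into coordinates, as where_index does per true element.
std::vector<int64_t> Unravel(int64_t flat, const std::vector<int64_t>& stride) {
  std::vector<int64_t> coord(stride.size());
  for (size_t r = 0; r < stride.size(); ++r) {
    coord[r] = flat / stride[r];
    flat -= coord[r] * stride[r];
  }
  return coord;
}

// Example: flat index 5 in a {2, 3} tensor has strides {3, 1}
// and unravels to the coordinate {1, 2}.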
+ +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +void where_index_compute_ref(lite::Tensor* condition, lite::Tensor* out) { + auto dims = condition->dims(); + auto numel = condition->numel(); + const int64_t rank = static_cast(dims.size()); + const T* cond_data = condition->data(); + std::vector true_index; + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index.push_back(i); + } + } + int64_t true_num = static_cast(true_index.size()); + out->Resize({true_num, rank}); + int64_t* out_ptr = out->mutable_data(); + if (true_num == 0) { + return; + } + + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int j = 0; j < rank; ++j) { + out_ptr[i * rank + j] = index / stride[j]; + index -= out_ptr[i * rank + j] * stride[j]; + } + } +} + +TEST(where_index, init) { + WhereIndexCompute where_index; + ASSERT_EQ(where_index.precision(), PRECISION(kAny)); + ASSERT_EQ(where_index.target(), TARGET(kHost)); +} + +TEST(where_index, retrive_op) { + auto where_index = + KernelRegistry::Global().Create( + "where_index"); + ASSERT_FALSE(where_index.empty()); + ASSERT_TRUE(where_index.front()); +} + +TEST(where_index, compute) { + paddle::lite::DeviceInfo::Init(); + WhereIndexCompute where_index; + operators::WhereIndexParam param; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + param.input = &input; + param.output = &output; + where_index.SetParam(param); + for (auto& n : {1, 2, 4}) { + for (auto& c : {1, 3, 21, 32}) { + for (auto& h : {1, 5, 63}) { + for (auto& w : {1, 5, 64}) { + for (auto& dim_size : {1, 2, 3, 4}) { + for (int i = 0; i < 5; ++i) { + std::vector in_shape; + in_shape.push_back(n); + in_shape.push_back(c); + in_shape.push_back(h); + in_shape.push_back(w); + int outer = 1; + for (int i = dim_size - 1; i < in_shape.size(); ++i) { + outer *= in_shape[i]; + } + in_shape.resize(dim_size); + in_shape[dim_size - 1] = outer; + + DDim indim(in_shape); + LOG(INFO) << "in dims: "; + for (int i = 0; i < dim_size; ++i) { + LOG(INFO) << in_shape[i]; + } + input.Resize(indim); + std::default_random_engine engine; + std::uniform_real_distribution dist(-1, 1); + if (i == 0) { + int* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 1) { + int64_t* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 2) { + int8_t* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = static_cast(dist(engine) > 0); + } + where_index_compute_ref(&input, &output_ref); + } else if (i == 3) { + bool* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = dist(engine) > 0; + } + where_index_compute_ref(&input, &output_ref); + } else { + float* indata = input.mutable_data(); + for (int i = 0; i < indim.production(); ++i) { + indata[i] = dist(engine) > 0; + } + where_index_compute_ref(&input, &output_ref); + 
} + where_index.Run(); + const int64_t* outdata = output.data(); + const int64_t* outdata_ref = output_ref.data(); + CHECK_EQ(output.dims(), output_ref.dims()) + << "where_index out shape error! out_dim is not equal " + "to out_ref dim"; + for (int i = 0; i < output.numel(); i++) { + if (std::abs(outdata[i] - outdata_ref[i]) > 0) { + LOG(FATAL) << "where_index cmp error, i: " << i + << ", output_data: " << outdata[i] + << ", output_ref_data: " << outdata_ref[i] + << "input precision: " + << static_cast(input.precision()); + } + } + } + } + } + } + } + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(where_index, kHost, kAny, kAny, def); diff --git a/lite/kernels/arm/while_compute.cc b/lite/kernels/host/while_compute.cc similarity index 50% rename from lite/kernels/arm/while_compute.cc rename to lite/kernels/host/while_compute.cc index 9241fd410a542cef797b57b9341f59895b0f734d..4886b5ffe0f48b231bcef59b5494fc126b8b69e2 100644 --- a/lite/kernels/arm/while_compute.cc +++ b/lite/kernels/host/while_compute.cc @@ -12,44 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/while_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/while_compute.h" +#include +#include namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void WhileCompute::PrepareForRun() { - auto ¶m = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto ¶m = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } void WhileCompute::Run() { - auto ¶m = Param(); + auto ¶m = this->Param(); while (param.cond->data()[0]) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL( - while, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::WhileCompute, def) - .BindInput("X", {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) + while, kHost, kAny, kAny, paddle::lite::kernels::host::WhileCompute, def) + .BindInput("X", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .BindInput("Condition", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) .BindOutput("Out", - {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) - .BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))}) + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("StepScopes", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/while_compute.h b/lite/kernels/host/while_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..42065865e45c18376034dea0e105bc6d4f1f053f --- /dev/null +++ b/lite/kernels/host/while_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
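Aside (not part of the patch): the relocated while kernel above simply re-executes its sub-program for as long as the scalar condition tensor holds true; the loop terminates only because the sub-program rewrites that same condition buffer. A minimal sketch of the contract, with std::function standing in for the RuntimeProgram (illustrative):

#include <functional>

// 'cond' points at a scalar bool that the executed block is expected to
// update in place; it is re-read on every iteration, mirroring
// WhileCompute::Run().
inline void RunWhile(const std::function<void()>& run_block, const bool* cond) {
  while (cond[0]) {
    run_block();
  }
}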
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhileCompute + : public KernelLite { + public: + using param_t = operators::WhileParam; + + void Run() override; + void PrepareForRun() override; + + virtual ~WhileCompute() = default; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..be0a8d05081e3dda5f474689dc4eed23bc5f56c4 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_huawei_ascend_npu subgraph_bridge_engine ${huawei_ascend_npu_subgraph_bridges}) diff --git a/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6fac2b0b560dcc467132abe9a21c2c75d266a77 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt @@ -0,0 +1,19 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(subgraph_bridge_utility_huawei_ascend_npu SRCS utility.cc DEPS) +lite_cc_library(subgraph_bridge_graph_huawei_ascend_npu SRCS graph.cc DEPS subgraph_bridge_utility_huawei_ascend_npu) + +set(huawei_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_huawei_ascend_npu subgraph_bridge_graph_huawei_ascend_npu) + +lite_cc_library(subgraph_bridge_act_op_huawei_ascend_npu SRCS act_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_conv_op_huawei_ascend_npu SRCS conv_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) + +set(huawei_ascend_npu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_engine + subgraph_bridge_graph_huawei_ascend_npu + subgraph_bridge_act_op_huawei_ascend_npu + subgraph_bridge_conv_op_huawei_ascend_npu + CACHE INTERNAL "huawei_ascend_npu_subgraph_bridges") diff --git a/lite/kernels/huawei_ascend_npu/bridges/act_op.cc b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0293515356a13035fcdc4725c5de132ea06ceb67 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +template +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + + return SUCCESS; +} + +template <> +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + // only for leaky_relu + auto alpha = op_info->GetAttr("alpha"); + act_op->set_attr_negative_slope(alpha); + + return SUCCESS; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + sigmoid, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + tanh, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu6, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + leaky_relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softsign, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softplus, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc 
b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..075bbca8bd63a3c12d74b3624c6a1d51d7edfb76 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/conv_op.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " << op_type << "... "; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + ge::DataType ge_data_type = CvtPrecisionType(input->precision()); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + // Conv2D: groups must set to 1; DepthwiseConv2D: groups not supported. + CHECK_LE(groups, 1) + << "[HUAWEI_ASCEND_NPU] groups > 1 NOT supported, groups: " << groups; + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? 
op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + // Input node + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + input_node = graph->Add(input_name, *input); + } + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[HUAWEI_ASCEND_NPU] Paddings size should be " + "the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + // Check depthwise mode, and decide whether use DepthwiseConv2D Op + bool use_depthwise_conv = false; + bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1); + if (is_depthwise_mode && dilations[0] == 1 && dilations[1] == 1) { + use_depthwise_conv = true; + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] DepthwiseConv2D op is used."; + } + + // Filter node + auto filter_node = graph->Add(filter_name, *filter); + + // Add bias node if exists bias + // Supports the bias nodes with the following dimensions + // 0: {oc} => 1D tensor of foramt ND + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + std::vector bias_shape; + std::shared_ptr bias_node = nullptr; + bool is_channel_bias = false; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + // Ascend must update convop desc, or IR model build will fail + ge::TensorDesc conv2d_input_desc_x( + ge::Shape(CvtShape(input_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_filter( + ge::Shape(CvtShape(filter_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_bias( + ge::Shape(bias_shape), ge::FORMAT_ND, ge_data_type); + ge::TensorDesc conv2d_output_desc_y( + ge::Shape(CvtShape(output_dims)), ge::FORMAT_NCHW, ge_data_type); + // Setting desc name + conv2d_input_desc_x.SetName("conv2d_input_desc_x"); + conv2d_input_desc_filter.SetName("conv2d_input_desc_filter"); + conv2d_input_desc_bias.SetName("conv2d_input_desc_bias"); + conv2d_output_desc_y.SetName("conv2d_output_desc_y"); + // Conv node + std::shared_ptr conv_node = nullptr; + if (use_depthwise_conv && is_depthwise_mode) { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + 
conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({1, 1, strides[0], strides[1]})); + conv_op->set_attr_dilations({1, 1, dilations[0], dilations[1]}); + conv_op->set_attr_pads( + {paddings[0], paddings[1], paddings[2], paddings[3]}); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } else { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({bs, ic, strides[0], strides[1]})); + conv_op->set_attr_pads(ge::Operator::OpListInt( + {paddings[0], paddings[1], paddings[2], paddings[3]})); + conv_op->set_attr_dilations( + ge::Operator::OpListInt({bs, ic, dilations[0], dilations[1]})); + conv_op->set_attr_groups(groups); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } + // append Add node to support bias + if (bias_node != nullptr && !is_channel_bias) { + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_node = add_node; + } + CHECK(conv_node); + + // ONLY support relu/leaky_relu now + // to do (@qili93): add more act types + if (!act_type.empty()) { + if (act_type == "relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + } else if (act_type == "leaky_relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + act_op->set_attr_negative_slope(leaky_relu_alpha); + } else { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] act type not supported: " + << act_type; + return FAILED; + } + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE( + depthwise_conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.cc b/lite/kernels/huawei_ascend_npu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e1eaf1228fd3df7583ddc194b3d58862ddc0e12 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Const or data node " << name + << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + ge::TensorDesc desc(ge::Shape(shape), + CvtDataLayoutType(layout), + CvtPrecisionType(precision)); + desc.SetName("const_node_desc"); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + node->data()->update_output_desc_y(desc); + } else { + // Data node + node = Add(name, shape, precision, layout); + } + return node; +} + +// Data node +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); + ge::TensorDesc desc( + ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); + desc.SetName("data_node_desc"); + node->data()->update_input_desc_x(desc); + node->data()->update_output_desc_y(desc); + return node; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.h b/lite/kernels/huawei_ascend_npu/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..bb763004939a4ccfffdd526e92bc029509aab45e --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.h @@ -0,0 +1,196 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
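// [Editor's note - illustration only, not part of the patch] Graph::Add above keeps a
// vector of nodes per Paddle variable name, returns the 1-based count, and the Graph
// template Add (in graph.h below) appends that index to build a unique HiAI IR name;
// Get() returns the most recently added node. A minimal standalone sketch of that
// bookkeeping, using plain STL types instead of the Node/ge::Operator classes:
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<std::string>> nodes;  // var name -> generated IR names
  auto add = [&](const std::string& name) {
    auto& v = nodes[name];
    v.push_back(name + "__" + std::to_string(v.size() + 1));  // e.g. "conv_out__1"
    return v.size();                                          // returned index is >= 1
  };
  assert(add("conv_out") == 1);
  assert(add("conv_out") == 2);  // only variable nodes may be re-added under the same name
  assert(nodes.at("conv_out").back() == "conv_out__2");  // Get(name) returns the latest node
  return 0;
}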
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "op_proto/built-in/inc/all_ops.h" // opp/op_proto/built-in/inc + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { + public: + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_role(Role role) { role_ = role; } + + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } + + private: + std::shared_ptr data_{nullptr}; + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + Role role_{Role::kVar}; +}; + +class Graph { + public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node + template + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; + } + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data( + std::make_shared(name + "__" + paddle::lite::to_string(idx))); + return node; + } + + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); + } + + // Const node + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + if (shape.empty()) { + shape = {static_cast(data.size())}; + } else { + int size = 1; + for (auto i : shape) { + size *= i; + } + CHECK_EQ(data.size(), size); + } + Tensor tensor; + tensor.Resize(shape); + tensor.set_persistable(true); + std::memcpy(reinterpret_cast(tensor.mutable_data()), + reinterpret_cast(data.data()), + data.size() * sizeof(T)); + return Add(name, tensor, layout); + } + + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return 
Add(name, data, dims.Vectorize(), layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + int64_t size = 1; + for (auto i : shape) { + size *= i; + } + std::vector data(size, value); + return Add(name, data, shape, layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); + } + + // Data node + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); + } + + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[HUAWEI_ASCEND_NPU] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + private: + std::map>> nodes_; +}; + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..5d38a4b0e68df0ddd66e0642e34323c40a6f1056 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h @@ -0,0 +1,27 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// activation +USE_SUBGRAPH_BRIDGE(sigmoid, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(tanh, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu6, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softsign, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softplus, kHuaweiAscendNPU); +// conv +USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kHuaweiAscendNPU); diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.cc b/lite/kernels/huawei_ascend_npu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fdaa49b94f48ad12b58036cd89d2f545566cad6 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +ge::DataType CvtPrecisionType(PrecisionType itype) { + ge::DataType otype = ge::DT_FLOAT; + switch (itype) { + case PRECISION(kFloat): + otype = ge::DT_FLOAT; + break; + case PRECISION(kFP16): + otype = ge::DT_FLOAT16; + break; + case PRECISION(kInt8): + otype = ge::DT_INT8; + break; + case PRECISION(kInt16): + otype = ge::DT_INT16; + break; + case PRECISION(kInt32): + otype = ge::DT_INT32; + break; + case PRECISION(kInt64): + otype = ge::DT_INT64; + break; + // TODO(liq27) support more precision type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert precision type(" + << PrecisionToStr(itype) << ") from Lite to NPU"; + break; + } + return otype; +} + +ge::Format CvtDataLayoutType(DataLayoutType itype) { + ge::Format otype = ge::FORMAT_NCHW; + switch (itype) { + case DATALAYOUT(kNCHW): + otype = ge::FORMAT_NCHW; + break; + case DATALAYOUT(kNHWC): + otype = ge::FORMAT_NHWC; + break; + // TODO(liq27) support more data layout type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert data layout type(" + << DataLayoutToStr(itype) + << ") from Lite to HUAWEI_ASCEND_NPU"; + break; + } + return otype; +} + +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (size_t i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (size_t i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape, + DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); + auto in_size = in_tensor.dims().production(); + auto in_shape = in_tensor.dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + ge::TensorDesc out_desc(ge::Shape(out_shape), + CvtDataLayoutType(in_layout), + CvtPrecisionType(in_precision)); + auto out_size = out_desc.GetShape().GetShapeSize(); + CHECK_EQ(out_size, in_size); + ge::Tensor out_tensor; + out_tensor.SetTensorDesc(out_desc); + out_tensor.SetData(reinterpret_cast(in_tensor.raw_data()), + in_tensor.memory_size()); + return out_tensor; +} + +int CvtActMode(std::string act_type) { + int act_mode = 1; + if (act_type == "sigmoid") { + act_mode = 0; + } else if (act_type == "relu") { + act_mode = 1; + } else if (act_type == "tanh") { + act_mode = 2; + } else if (act_type == "relu_clipped" || act_type == "relu6") { 
+ act_mode = 3; + } else if (act_type == "elu") { + act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; + } else if (act_type == "abs") { + act_mode = 6; + } else if (act_type == "softsign") { + act_mode = 8; + } else if (act_type == "softplus") { + act_mode = 9; + } else if (act_type == "hard_sigmoid") { + act_mode = 10; + } else if (act_type == "thresholded_relu") { + act_mode = 11; + } else { + // TODO(liqi27) support more activation mode + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Unsupported activation type " + << act_type; + } + return act_mode; +} + +const std::string& CvtFormat(ge::Format format) { + static const int MAX_FORMAT_LENGTH = 25; + static const std::string format2string[] = { + "FORMAT_NCHW = 0", + "FORMAT_NHWC = 1", + "FORMAT_ND = 2", + "FORMAT_NC1HWC0 = 3", + "FORMAT_FRACTAL_Z = 4", + "FORMAT_NC1C0HWPAD = 5", + "FORMAT_NHWC1C0 = 6", + "FORMAT_FSR_NCHW = 7", + "FORMAT_FRACTAL_DECONV = 8", + "FORMAT_C1HWNC0 = 9", + "FORMAT_FRACTAL_DECONV_TRANSPOSE = 10", + "FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11", + "FORMAT_NC1HWC0_C04 = 12", + "FORMAT_FRACTAL_Z_C04 = 13", + "FORMAT_CHWN = 14", + "FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15", + "FORMAT_HWCN = 16", + "FORMAT_NC1KHKWHWC0 = 17", + "FORMAT_BN_WEIGHT = 18", + "FORMAT_FILTER_HWCK = 19", + "FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20", + "FORMAT_HASHTABLE_LOOKUP_KEYS = 21", + "FORMAT_HASHTABLE_LOOKUP_VALUE = 22", + "FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23", + "FORMAT_HASHTABLE_LOOKUP_HITS = 24"}; + auto x = static_cast(format); + CHECK_LT(x, MAX_FORMAT_LENGTH); + return format2string[x]; +} + +const std::string& CvtDataType(ge::DataType data_type) { + static const int MAX_DATATYPE_LENGTH = 14; + static const std::string datatype2string[] = {"DT_FLOAT=0", + "DT_FLOAT16=1", + "DT_INT8=2", + "DT_INT32=3", + "DT_UINT8=4", + "Unknown=5", + "DT_INT16=6", + "DT_UINT16=7", + "DT_UINT32=8", + "DT_INT64=9", + "DT_UINT64=10", + "DT_DOUBLE=11", + "DT_BOOL=12", + "DT_STRING=13"}; + + auto x = static_cast(data_type); + CHECK_LT(x, MAX_DATATYPE_LENGTH); + return datatype2string[x]; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.h b/lite/kernels/huawei_ascend_npu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..da9a8999ad09e545745f30e02ca62c60e6f9bf82 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.h @@ -0,0 +1,59 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
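// [Editor's note - illustration only, not part of the patch] CvtShape above pads a shape
// with leading 1s until it is 4-D, matching the NCHW layout expected by the Ascend IR.
// A standalone sketch of that behaviour (assuming int64_t dimensions):
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> CvtShapeSketch(const std::vector<int64_t>& in) {
  std::vector<int64_t> out(in.size() < 4 ? 4 - in.size() : 0, 1);  // leading 1s
  out.insert(out.end(), in.begin(), in.end());                     // then the original dims
  return out;
}

int main() {
  assert((CvtShapeSketch({32, 100}) == std::vector<int64_t>{1, 1, 32, 100}));
  assert((CvtShapeSketch({8, 3, 224, 224}) == std::vector<int64_t>{8, 3, 224, 224}));
  return 0;
}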
+ +#pragma once + +#include +#include +#include +#include +#include +// #include "graph/buffer.h" +#include "graph/tensor.h" +#include "graph/types.h" +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +ge::DataType CvtPrecisionType(PrecisionType itype); + +ge::Format CvtDataLayoutType(DataLayoutType itype); + +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape = {}, + DataLayoutType in_layout = DATALAYOUT(kNCHW)); + +int CvtActMode(std::string act_type); + +const std::string& CvtFormat(ge::Format format); +const std::string& CvtDataType(ge::DataType data_type); + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.cc b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e71c71ca28b163f27a9783572d585466335ef87 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc @@ -0,0 +1,483 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/subgraph_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/utils/io.h" +#include "lite/utils/md5.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. 
the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (size_t i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; + } + } + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} + +// Serialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op into files +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from cached file from:" + << model_path; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromFile( + model_path, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from cached file failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] read from " << config_path + << " failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading configuration success:" + << config_path; + std::string config_str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(config_str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; +} + +bool DeviceProgram::BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + // then added them into the IR graph + int status = 0; + subgraph::huawei_ascend_npu::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + CHECK(origin_program) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin 
program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kHuaweiAscendNPU))) { + return false; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kHuaweiAscendNPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return false; + } + } + // Collect the input and output nodes of the IR graph + std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } + std::vector device_onodes; + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); + } + // Build the IR graph to the om model + std::vector model_buffer; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Building model from model buffer..."; + if (!lite::huawei_ascend_npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Build model failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Build model success."; + // Load the om model and create a model manager client + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from memory ..."; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromMem( + model_buffer, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load model from memory success."; + // Update the precison and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); + } + if (!model_cache_dir.empty()) { + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << model_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved OM model success:"; + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (size_t i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << config_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved configuration file success."; + } + return true; +} + +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + 
std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + VLOG(3) << "[HUAWEI_ASCEND_NPU] Sharing buffer with origin tnsors..."; + if (device_idims_.empty() || device_odims_.empty()) { + if (!(model_client_->GetModelIOTensorDim(&device_idims_, &device_odims_))) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Get the dimensions of input and output " + "tensors failed!"; + return false; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim success."; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), output_names.size()); + for (size_t i = 0; i < input_names.size(); i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Inputs[" << i + << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + + // reset tensor desc + if ((*device_itensors)[i]->SetTensorDesc( + device_idims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetTensorDesc " + "success."; + } + // copy data from origin to device + if ((*device_itensors)[i]->SetData( + reinterpret_cast((*origin_itensors)[i]->raw_data()), + (*origin_itensors)[i]->memory_size()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the input tensors for the device program " + "and share their buffers with the origin input tensors"; + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_itensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); + } + for (size_t i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Outputs[" << i + << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // reset tensor desc + if ((*device_otensors)[i]->SetTensorDesc( + device_odims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor 
output tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor output tensor SetTensorDesc " + "success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the output tensors for the device program " + "and share their buffers with the origin output tensors"; + } + return true; +} + +bool DeviceProgram::SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + + for (size_t i = 0; i < output_names.size(); i++) { + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_otensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); + } + // unload model after model execution + CHECK_EQ(model_client_->UnloadModel(), true); + return true; +} + +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + // int istamp; + auto start_time = GetCurrentUS(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Starting ZeroCopyRun to ModelExecute ..."; + CHECK_EQ(model_client_->ModelExecute(device_itensors, device_otensors), true); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Process cost " << GetCurrentUS() - start_time + << " us"; + return true; +} + +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (size_t i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new ge::Tensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (size_t i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new ge::Tensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(); + auto device_id = ctx_->As().HuaweiAscendDeviceID(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get model cached dir: " << model_cache_dir; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get huawei ascend npu device id: " + << device_id; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile(input_names_, + output_names_, + origin_idims_, + model_cache_dir, + device_id)) { + // Build the 
model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. + if (!origin_program_) { + BuildOriginProgram(); + } + CHECK(origin_program_) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir, + device_id)) { + return false; + } + } + if (device_program->model_client_ == nullptr) { + return false; + } + device_programs_[origin_idims_] = device_program; + } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); +} + +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. + if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); + } + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); + } + if (!device_program->ZeroCopyRun(&device_itensors_, &device_otensors_)) { + return false; + } + if (!device_program->SharedBufferWithOutputTensors( + output_names_, &origin_otensors_, &device_otensors_)) { + return false; + } + return true; +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.block_idx, + param.program_desc, + param.exec_scope, + param.input_data_names, + param.output_data_names)); + CHECK(engine_); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Run(); +} + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kHuaweiAscendNPU, + kAny, + kNCHW, + paddle::lite::kernels::huawei_ascend_npu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.h b/lite/kernels/huawei_ascend_npu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..fb7d2efe0c29912a07f11a544c91432d69c51fa0 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.h @@ -0,0 +1,121 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
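// [Editor's note - illustration only, not part of the patch] BuildGraphAndCacheToFile
// above writes two sidecar files per subgraph into the model cache dir: "<md5>.om"
// (the built model) and "<md5>.cfg", where each output tensor is recorded as
// "<precision-as-int>:<dim>,<dim>,...;" and LoadFromCacheFile splits on ';', ':' and ','
// to recover it. A standalone sketch of the same encoding (the precision values below
// are hypothetical placeholders, not real PrecisionType enum values):
#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

int main() {
  std::vector<int> otypes = {1, 1};  // hypothetical precision ids, one per output tensor
  std::vector<std::vector<int64_t>> odims = {{1, 1000}, {1, 5}};
  std::ostringstream os;
  for (size_t i = 0; i < otypes.size(); ++i) {
    os << otypes[i] << ":";
    for (auto d : odims[i]) os << d << ",";  // dims end with a trailing comma, as in the patch
    os << ";";                               // one record per output, ';'-terminated
  }
  std::cout << os.str() << std::endl;  // prints "1:1,1000,;1:1,5,;"
  return 0;
}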
+ +#pragma once + +#include +#include +#include +#include +#include "graph/tensor.h" +#include "lite/backends/huawei_ascend_npu/device.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +using TensorDesc = paddle::lite::huawei_ascend_npu::TensorDesc; +using AclModelClient = paddle::lite::huawei_ascend_npu::AclModelClient; + +class DeviceProgram { + public: + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id); + bool BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors); + bool ZeroCopyRun(std::vector>* device_itensors, + std::vector>* device_otensors); + + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + const std::shared_ptr& program_desc, + Scope* exec_scope, + const std::vector& input_names, + const std::vector& output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} + + protected: + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; + + private: + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + void PrepareForRun() override; + void Run() override; + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt index f9395d45ccecccaf3f873797d0c2d71eda266319..634a0afc551d83be58487d7393e092196e0f6cc5 100644 --- a/lite/kernels/mlu/CMakeLists.txt +++ b/lite/kernels/mlu/CMakeLists.txt @@ -4,6 +4,7 @@ endif() add_subdirectory(bridges) add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges}) -add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) -add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) -add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu}) 
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu}) +add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps}) +# depend on transpose function in backend/x86/math/math_function +add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function}) diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt index 82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514..91323925e1ef49462c180fd96392d638e273fd69 100644 --- a/lite/kernels/mlu/bridges/CMakeLists.txt +++ b/lite/kernels/mlu/bridges/CMakeLists.txt @@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU) endif() lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor) -lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs}) +lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu) lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu) set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu) @@ -18,6 +18,16 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu}) lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_transpose_op_mlu SRCS transpose_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_dropout_op_mlu SRCS dropout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_slice_op_mlu SRCS slice_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_split_op_mlu SRCS split_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -28,12 +38,35 @@ set(mlu_subgraph_bridges subgraph_bridge_pool_op_mlu subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu + subgraph_bridge_transpose_op_mlu subgraph_bridge_batch_norm_op_mlu subgraph_bridge_scale_op_mlu subgraph_bridge_interp_op_mlu subgraph_bridge_concat_op_mlu + subgraph_bridge_dropout_op_mlu + subgraph_bridge_slice_op_mlu + subgraph_bridge_split_op_mlu + subgraph_bridge_cast_op_mlu + subgraph_bridge_layout_op_mlu + subgraph_bridge_argmax_op_mlu + subgraph_bridge_squeeze_op_mlu + subgraph_bridge_reshape_op_mlu + subgraph_bridge_flatten_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") + +if (LITE_BUILD_EXTRA) + lite_cc_library(subgraph_bridge_lrn_op_mlu SRCS lrn_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu}) + 
lite_cc_library(subgraph_bridge_norm_op_mlu SRCS norm_op.cc DEPS ${subgraph_bridge_deps_mlu}) + set(mlu_subgraph_bridges + "${mlu_subgraph_bridges}" + subgraph_bridge_lrn_op_mlu + subgraph_bridge_gather_op_mlu + subgraph_bridge_norm_op_mlu + CACHE INTERNAL "mlu_subgraph_bridges") +endif() + lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) @@ -45,4 +78,21 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_transpose_converter_mlu SRCS transpose_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_dropout_converter_mlu SRCS dropout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_slice_converter_mlu SRCS slice_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_split_converter_mlu SRCS split_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + +if (LITE_BUILD_EXTRA) + lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host 
model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +endif() + message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 286195d9d5f961288dd0156db31ff8aacae58227..d24c7fac216ed0ba213a4fd95365132a693281c3 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); } graph->FuseOp(activation_op); + CNML_CALL(cnmlDestroyBaseOp(&activation_op)); return SUCCESS; } @@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu6, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(leaky_relu, kMLU, diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc index 2b7747f4d8b647b8cb621876907f6178ebf9fe88..11c0c3f732c4c29fff3aedc6cfdcf55760128b5d 100644 --- a/lite/kernels/mlu/bridges/act_op_test.cc +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -116,7 +118,7 @@ void test_act(std::vector x_shape, std::string op_type) { opdesc.SetAttr("offset", 0.5f); } - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor act_ref(op); @@ -134,7 +136,8 @@ void test_act(std::vector x_shape, std::string op_type) { TEST(MLUBridges, activation) { std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; - std::vector types{"sigmoid", "relu", "tanh", "leaky_relu"}; + std::vector types{ + "sigmoid", "relu", "relu6", "tanh", "leaky_relu"}; for (auto x_shape : shapes) { for (auto op_type : types) { test_act(x_shape, op_type); @@ -149,5 +152,6 @@ TEST(MLUBridges, activation) { USE_SUBGRAPH_BRIDGE(sigmoid, kMLU) USE_SUBGRAPH_BRIDGE(relu, kMLU) +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(tanh, kMLU) USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU) diff --git a/lite/kernels/mlu/bridges/argmax_op.cc b/lite/kernels/mlu/bridges/argmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b004639f07c79e5cc414e2d60bc1f32ec522f0f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + cnmlDimension_t argmax_mode = static_cast(axis); + auto mlu_output_dim = x->dims().Vectorize(); + // shape is NCHW, layout is NHWC + mlu_output_dim[axis] = 1; + auto input_tensor = graph->GetNode(x_var_name); + // if use_fp16 and axis is not c, cast input datatype from fp16 to fp32, so + // output datatype is int32 + bool cast_to_fp32 = + graph->FPType() == CNML_DATA_FLOAT16 && argmax_mode != CNML_DIM_C; + cnmlBaseOp_t cast_op{nullptr}; + std::shared_ptr fp32_input_tensor; + if (cast_to_fp32) { + fp32_input_tensor = graph->AddNode(x_var_name + ".fp32", + x_dims, + CNML_TENSOR, + CNML_NCHW, + CNML_DATA_FLOAT32); + cnmlCreateCastOp(&cast_op, + CNML_CAST_FLOAT16_TO_FLOAT32, + input_tensor->mlu_tensor(), + fp32_input_tensor->mlu_tensor()); + } + auto output_tensor = graph->AddNode( + out_var_name, mlu_output_dim, CNML_TENSOR, CNML_NCHW, CNML_DATA_INT32); + + CHECK(graph->HasNode(x_var_name)); + cnmlBaseOp_t argmax_op{nullptr}; + // ======================= DEBUG INFO ===================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "x dims: " << x->dims(); + VLOG(6) << "output dims: " << output->dims(); + VLOG(6) << "axis: " << axis; + VLOG(6) << "cast_to_fp32: " << cast_to_fp32; + cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======================= DEBUG END ===================== + + CNML_CALL(cnmlCreateArgmaxOp(&argmax_op, + argmax_mode, + cast_to_fp32 ? 
fp32_input_tensor->mlu_tensor() + : input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + if (cast_to_fp32) { + graph->FuseOp(cast_op); + } + graph->FuseOp(argmax_op); + CNML_CALL(cnmlDestroyBaseOp(&argmax_op)); + if (cast_op) { + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kMLU, + paddle::lite::subgraph::mlu::ArgmaxConverter); diff --git a/lite/kernels/mlu/bridges/argmax_op_test.cc b/lite/kernels/mlu/bridges/argmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9eeb172812b8deecd6a8f1f2eb321ade4289fa9b --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/argmax_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void argmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + auto y_shape = x_dims.Vectorize(); + y_shape.erase(y_shape.begin() + axis); + out->Resize(y_shape); + auto out_dims = out->dims(); + + auto* x_data = x->mutable_data(); + auto* out_data = out->mutable_data(); + + const int size = x_dims[axis]; + const int in_channel = x_dims.count(axis, x_dims.size()); + const int out_channel = out_dims.count(axis, out_dims.size()); + const int in_stride = x_dims.count(axis + 1, x_dims.size()); + const int out_stride = x_dims.count(0, axis); + // int index = 0; + for (int n = 0; n < out_stride; n++) { + for (int k = 0; k < in_stride; k++) { + const float* in_ptr = x_data + n * in_channel + k; + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(in_ptr[i * in_stride], i); + } + // sort + std::partial_sort(vec.begin(), + vec.begin() + 1, + vec.end(), + std::greater>()); + + out_dtype* out_ptr = out_data + n * out_channel + k; + *out_ptr = vec[0].second; + } + } +} + +void test_argmax(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + 
FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("arg_max"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + argmax_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + out_shape[axis] = 1; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, arg_max) { + test_argmax({1, 2, 3, 4}, 1); + test_argmax({1, 2, 3, 4}, 2); + test_argmax({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc index 7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55..ceac1ac696d788869e77a1b173cc0bb4d10a4e21 100644 --- a/lite/kernels/mlu/bridges/batch_norm_op.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mean = scope->FindVar(mean_var_name)->GetMutable(); auto mean_dims = mean->dims().Vectorize(); + if (mean_dims.size() < 4) { + mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1); + } auto mean_tensor = graph->AddNode( - mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto variance = scope->FindVar(variance_var_name)->GetMutable(); auto variance_dims = variance->dims().Vectorize(); + if (variance_dims.size() < 4) { + variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1); + } auto variance_tensor = graph->AddNode( - variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto scale = scope->FindVar(scale_var_name)->GetMutable(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - int co = static_cast(mean_dims[0]); + int co = static_cast(mean_dims[3]); + std::vector variance_trans(co); + std::vector mean_trans(co); for (int i = 0; i < co; ++i) { - variance->mutable_data()[i] = + variance_trans[i] = scale->data()[i] / sqrtf(variance->data()[i] + epsilon); - mean->mutable_data()[i] = - mean->data()[i] - - bias->data()[i] / variance->data()[i]; + mean_trans[i] = + mean->data()[i] - bias->data()[i] / variance_trans[i]; } auto input_tensor = graph->GetNode(x_var_name); @@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { mean_tensor->mlu_tensor(), variance_tensor->mlu_tensor())); - 
graph->BindConstData(variance_var_name, variance); - graph->BindConstData(mean_var_name, mean); + graph->BindConstRawData( + variance_var_name, variance_trans.data(), variance_trans.size(), true); + graph->BindConstRawData( + mean_var_name, mean_trans.data(), mean_trans.size(), true); graph->FuseOp(bn_op); + CNML_CALL(cnmlDestroyBaseOp(&bn_op)); + return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/cast_op.cc b/lite/kernels/mlu/bridges/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25d988ce5aee519dfb00574343956022b30a89e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto in_dtype = op_info->GetAttr("in_dtype"); + auto out_dtype = op_info->GetAttr("out_dtype"); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + + cnmlDataType_t out_type; + cnmlCastType_t cast_type; + if (in_dtype == 4 && out_dtype == 5) { + cast_type = CNML_CAST_FLOAT16_TO_FLOAT32; + out_type = CNML_DATA_FLOAT32; + } else if (in_dtype == 5 && out_dtype == 4) { + cast_type = CNML_CAST_FLOAT32_TO_FLOAT16; + out_type = CNML_DATA_FLOAT16; + } else { + CHECK(0) << "Unsupported cast type"; + } + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type); + + cnmlBaseOp_t cast_op; + CNML_CALL(cnmlCreateCastOp(&cast_op, + cast_type, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(cast_op); + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(cast, + kMLU, + paddle::lite::subgraph::mlu::CastConverter); diff --git a/lite/kernels/mlu/bridges/cast_op_test.cc b/lite/kernels/mlu/bridges/cast_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2389ad5560cd2ede710626cfd40f8db8bff56351 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/cast_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_cast_FP16_to_FP32(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 4, out_dtype = 5; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFP16); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], static_cast(copy_data[i]), 5e-4); + } +} + +void test_cast_FP32_to_FP16(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 5, out_dtype = 4; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFloat); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(static_cast(out_data[i]), copy_data[i], 5e-4); + } +} + +TEST(MLUBridges, cast) { + test_cast_FP16_to_FP32({2, 3, 4, 5}); + test_cast_FP16_to_FP32({6, 3, 2, 5}); + test_cast_FP32_to_FP16({2, 3, 4, 5}); + test_cast_FP32_to_FP16({6, 3, 2, 5}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(cast, kMLU); diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc 
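The batch_norm bridge above no longer overwrites the variance/mean tensors in place; it folds scale, bias and epsilon into two per-channel buffers and binds those as const raw data. A standalone sketch of that folding, assuming the usual inference-time formula y = scale * (x - mean) / sqrt(var + eps) + bias; the struct and function names are made up for illustration.

// Editor's sketch (not part of the patch): per-channel batch-norm folding,
// mirroring the variance_trans / mean_trans computation in batch_norm_op.cc.
#include <cmath>
#include <vector>

struct FoldedBN {
  std::vector<float> alpha;  // folded "variance" handed to the BN op
  std::vector<float> mean;   // folded "mean"
};

// y = scale * (x - mean) / sqrt(var + eps) + bias
//   = alpha * (x - (mean - bias / alpha)), with alpha = scale / sqrt(var + eps)
FoldedBN FoldBatchNorm(const std::vector<float>& scale,
                       const std::vector<float>& bias,
                       const std::vector<float>& mean,
                       const std::vector<float>& var,
                       float epsilon) {
  FoldedBN out;
  out.alpha.resize(scale.size());
  out.mean.resize(scale.size());
  for (size_t i = 0; i < scale.size(); ++i) {
    out.alpha[i] = scale[i] / std::sqrt(var[i] + epsilon);
    out.mean[i] = mean[i] - bias[i] / out.alpha[i];
  }
  return out;
}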
index 14f0da746a00c1ea10ffae824217dbb2df84df55..1d566639937d79cf1c98c70bfc1294d874fb89c4 100644 --- a/lite/kernels/mlu/bridges/concat_op.cc +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto dims = output_dims.size(); int axis = (param_axis < 0) ? (param_axis + dims) : param_axis; - CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; - int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; - int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + CHECK_LT(axis, dims) << "Unsupport dims in mlu concat"; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { &outputs, 1)); graph->FuseOp(concat_op); + CNML_CALL(cnmlDestroyBaseOp(&concat_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e7e21f7ad2f64275746e015289c9372368e46f5c..6d10605e2c4060cbd8b30d358ac15f2e78f13ca5 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include "lite/kernels/mlu/bridges/graph.h" #include "lite/kernels/mlu/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* op_info = op->op_info(); const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; + CHECK(!op_info->HasAttr("act_type")); // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); @@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; + const auto groups = op_info->GetAttr("groups"); + CHECK_EQ(input_dims.size(), 4u); CHECK_EQ(filter_dims.size(), 4u); + CHECK(!(op_info->HasAttr("fuse_relu") && + (op_info->GetAttr("fuse_relu") == true))) + << "UnSupported param fuse_relu is true!"; const auto strides = op_info->GetAttr>("strides"); auto dilations = op_info->GetAttr>("dilations"); auto paddings = op_info->GetAttr>("paddings"); @@ -70,18 +78,36 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { padding_algorithm, input_dims, filter_dims); + bool is_group_mode = groups > 1; + bool is_depthwise_mode = false; + if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 && + dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw} + is_depthwise_mode = true; + is_group_mode = false; + } + + auto input_tensor = graph->GetNode(input_var_name); const auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); + std::vector cnml_filter_shape = { + filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]}; + if (is_depthwise_mode) { + /*paddle filter shape is {oc , ic / groups == 1, kh, kw} while + cnml depthwise conv filter expect shape {oc / groups == 1 , ic , kh, kw} + so we should shape filter shape + */ + cnml_filter_shape = { + filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]}; + } // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, - filter_dims.Vectorize(), + cnml_filter_shape, 
CNML_FILTER, CNML_NCHW, graph->FPType()); - const auto weight_scale = - op_info->GetAttr>("weight_scale"); + const auto weight_scale = op_info->GetInputScale(filter_var_name); if (filter->precision() == PrecisionType::kUnk || filter->precision() == PrecisionType::kInt8) { @@ -89,15 +115,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { dequant(filter_dequant.data(), filter->mutable_data(), 1, - filter_dims[0], - filter_dims[1] * filter_dims[2] * filter_dims[3], + cnml_filter_shape[0], + cnml_filter_shape[1] * cnml_filter_shape[2] * cnml_filter_shape[3], weight_scale); transpose(filter_dequant.data(), filter->mutable_data(), - {static_cast(filter_dims[0]), - static_cast(filter_dims[1]), - static_cast(filter_dims[2]), - static_cast(filter_dims[3])}, + {static_cast(cnml_filter_shape[0]), + static_cast(cnml_filter_shape[1]), + static_cast(cnml_filter_shape[2]), + static_cast(cnml_filter_shape[3])}, {0, 2, 3, 1}); filter->set_precision(PrecisionType::kFloat); } else if (filter->precision() != PrecisionType::kFloat) { @@ -116,7 +142,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector bias_shape; if (bias_data_size == oc) { // 0: {oc} - bias_shape = {oc}; + bias_shape = {1, 1, 1, oc}; } else if (bias_data_size == output_data_size / bs) { LOG(FATAL) << "Unsupported ... ..."; // 1: {1, oc, oh, ow} @@ -130,18 +156,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } - const auto input_scale = op_info->GetAttr("input_scale"); + const auto input_scale = op_info->GetInputScale(input_var_name)[0]; bool use_first_conv = false; - if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) { use_first_conv = true; } @@ -158,38 +181,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings[0], paddings[0])); const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); const auto std_tensor = graph->AddNode("first_conv_std_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); graph->BindConstRawData("first_conv_mean_tensor", - lite::DeviceInfo::Global().MeanVec().data(), + lite::TargetWrapperMlu::MeanVec().data(), 3, false); graph->BindConstRawData("first_conv_std_tensor", - lite::DeviceInfo::Global().StdVec().data(), + lite::TargetWrapperMlu::StdVec().data(), 3, false); - graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + input_tensor->set_mlu_dtype(CNML_DATA_UINT8); CNML_CALL(cnmlCreateConvFirstOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), mean_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr, std_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else if (is_depthwise_mode) { + cnmlConvDepthwiseOpParam_t conv_depthwise_param; + cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param, + strides[0], + strides[1], + paddings[0] * 2, + paddings[2] * 2); + CNML_CALL(cnmlCreateConvDepthwiseOpForward( + &conv_op, + conv_depthwise_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param)); + } else if (is_group_mode) { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvGroupOpForward( + &conv_op, + conv_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + groups)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } else { cnmlConvOpParam_t conv_param; + VLOG(5) << "conv param (" << input_var_name << ")" + << "stride: " << strides[0] << ',' << strides[1] << '\t' + << "dilations: " << dilations[0] << ',' << dilations[1] << '\t' + << "paddings: " << paddings[0] << ',' << paddings[2] << std::endl; CNML_CALL(cnmlCreateConvOpParam(&conv_param, strides[0], strides[1], @@ -200,19 +260,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CNML_CALL(cnmlCreateConvOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } - graph->SetComputingDataType( - conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); - graph->SetComputingDataType( - conv_op, - filter_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + if (!is_depthwise_mode) { + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *max_element(weight_scale.begin(), weight_scale.end())); + } CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); if (HasInputArg(op_info, scope, "Bias")) { auto* bias = scope->FindVar(bias_var_name)->GetMutable(); @@ -220,6 +282,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyBaseOp(&conv_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index 1b04814d7d88d227d0bb3e0b58aef26d62f06966..e23f7c68ab0048b8cc04ffdae33ea94fcabbcf65 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -13,8 +13,11 @@ // limitations under the License. 
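The concat bridge above replaces the hard-coded 4-D map {0, 3, 1, 2} with a GetAxisNHWC2NCHW helper. A minimal sketch of the remapping it performs for 4-D tensors; only the 4-D table is confirmed by the removed code, and the function names below are illustrative, not the patch's helpers.

// Editor's sketch (not part of the patch): remapping a concat axis given in
// NCHW order to the corresponding NHWC position. For 4-D tensors the table is
// {0, 3, 1, 2} (N->0, C->3, H->1, W->2), the same map the patch removes.
#include <cassert>
#include <vector>

std::vector<int> AxisNHWC2NCHW4D() { return {0, 3, 1, 2}; }

int RemapConcatAxis(int param_axis, int rank) {
  assert(rank == 4);  // only the 4-D mapping is shown here
  int axis = param_axis < 0 ? param_axis + rank : param_axis;
  return AxisNHWC2NCHW4D()[axis];
}
// Example: concatenating along channels (axis 1 in NCHW) becomes axis 3 in NHWC.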
#include "lite/operators/conv_op.h" + #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -221,8 +224,10 @@ void test_conv(int bs, opdesc_mlu.SetAttr("groups", groups); opdesc_mlu.SetAttr("fuse_relu", static_cast(fuse_relu)); - opdesc_mlu.SetAttr("weight_scale", std::vector(oc, filter_scale)); - opdesc_mlu.SetAttr("input_scale", input_scale); + OpInfo op_info(opdesc_mlu); + op_info.SetInputScale(filter_int_var_name, + std::vector(oc, filter_scale)); + op_info.SetInputScale(input_var_name, {input_scale}); if (has_bias) { if (is_channel_bias) { @@ -231,7 +236,7 @@ void test_conv(int bs, bias->Resize({output_shape}); } FillTensor(bias); - opdesc_mlu.SetInput("Bias", {bias_var_name}); + op_info.SetInput("Bias", {bias_var_name}); } for (int i = 0; i < bs; i++) { @@ -245,7 +250,7 @@ void test_conv(int bs, } // create and convert op to MLU model, then run it on MLU - auto op = CreateOp(opdesc_mlu, &scope); + auto op = CreateOp(op_info, &scope); LaunchOp(op, {input_var_name}, {output_var_name}); // compare results auto* output_data = output->mutable_data(); @@ -331,6 +336,10 @@ TEST(MLUBridges, conv) { #endif } +TEST(MLUBridges, depthwise_conv2d) { + test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3); +} + } // namespace mlu } // namespace subgraph } // namespace lite diff --git a/lite/kernels/mlu/bridges/dropout_op.cc b/lite/kernels/mlu/bridges/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9aa296236e05a0c80ed9b7001f940cce99b019f7 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
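The conv bridge above tells depthwise convolution apart from grouped convolution and swaps the first two filter dimensions in the depthwise case, as its inline comment explains. A minimal sketch of just that shape decision, with made-up names and the CNML calls left out.

// Editor's sketch (not part of the patch): choosing the CNML filter shape.
// Paddle stores a depthwise filter as {oc, ic/groups == 1, kh, kw}; the CNML
// depthwise op expects {oc/groups == 1, ic, kh, kw}, so dims 0 and 1 swap.
#include <cstdint>
#include <vector>

std::vector<int64_t> CnmlFilterShape(const std::vector<int64_t>& filter_dims,
                                     int groups,
                                     const std::vector<int>& dilations) {
  bool is_depthwise = filter_dims[0] == groups && filter_dims[1] == 1 &&
                      dilations[0] == 1 && dilations[1] == 1;
  if (is_depthwise) {
    return {filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]};
  }
  return filter_dims;  // group (>1) and ordinary conv keep the Paddle layout
}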
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + /* auto mask_var_name = op_info->Output("Mask").front(); */ + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + /* auto mask = scope->FindVar(mask_var_name)->GetMutable(); */ + /* auto mask_dims = mask->dims().Vectorize(); */ + /* auto mask_tensor = graph->AddNode( */ + /* mask_var_name, mask_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); */ + + // is_test is true by default + // if(op_info->HasAttr("is_test")){ + // auto is_test = op_info->GetAttr("is_test"); + // CHECK(is_test != true); + // } + + // Param fix_seed and seed is useless in MLU + + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + std::vector shape = {1, 1, 1, 1}; + std::string alpha_var_name = string_format("dropout_alpha_%p", op); + std::string beta_var_name = string_format("dropout_beta_%p", op); + auto alpha_tensor = graph->AddNode( + alpha_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + beta_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData(alpha_var_name, &alpha, 1); + graph->BindConstRawData(beta_var_name, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(dropout, + kMLU, + paddle::lite::subgraph::mlu::DropoutConverter); diff --git a/lite/kernels/mlu/bridges/dropout_op_test.cc b/lite/kernels/mlu/bridges/dropout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f03e3051a6c568d541b98b64808e27470d8916 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op_test.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
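The dropout bridge above lowers inference-time dropout to a single scale op: alpha is 1 - dropout_prob for downgrade_in_infer, 1 for upscale_in_train, and beta is 0. The same arithmetic as a standalone reference; it matches dropout_ref in the test that follows.

// Editor's sketch (not part of the patch): inference-time dropout reduces to
// out[i] = alpha * x[i] + beta, exactly what the bridge feeds the scale op.
#include <string>
#include <vector>

std::vector<float> DropoutInfer(const std::vector<float>& x,
                                float dropout_prob,
                                const std::string& dropout_implementation) {
  float alpha = 1.0f - dropout_prob;
  if (dropout_implementation == "upscale_in_train") alpha = 1.0f;
  const float beta = 0.0f;
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = alpha * x[i] + beta;
  return out;
}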
+ +#include "lite/operators/dropout_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void dropout_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * alpha + beta; + } +} + +void test_dropout(int bs, + int ic, + int ih, + int iw, + std::string dropout_implementation, + float dropout_prob, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string mask_var_name("mask"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* mask = scope.Var(mask_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + bool is_test = true; + bool fix_seed = false; + int seed = 0; + cpp::OpDesc opdesc; + opdesc.SetType("dropout"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("Mask", {mask_var_name}); + opdesc.SetAttr("is_test", is_test); + opdesc.SetAttr("fix_seed", fix_seed); + opdesc.SetAttr("seed", seed); + opdesc.SetAttr("dropout_implementation", dropout_implementation); + opdesc.SetAttr("dropout_prob", dropout_prob); + VLOG(6) << "mask: " << mask->dims()[0] << std::endl; + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + dropout_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, dropout) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto 
dropout_implementation : + {"downgrade_in_infer", "upscale_in_train"}) { + for (auto dropout_prob : {0.f, 1.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + << " dropout_implementation: " << dropout_implementation + << " dropout_prob: " << dropout_prob; + test_dropout( + bs, ic, ih, iw, dropout_implementation, dropout_prob, 0.); + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(dropout, kMLU); diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 41526a0100ba71be9eda25983cb96aa888d6cf4d..5f7192a0628a7887dbca15d63f1ba22799d7ee4b 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -23,7 +23,7 @@ namespace mlu { std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { auto x_dims = x.dims(); - CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; + // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; auto y_dims = y->dims(); CHECK_GE(x_dims.size(), y_dims.size()); @@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->FuseOp(elementwise_op); + CNML_CALL(cnmlDestroyBaseOp(&elementwise_op)); cnmlBaseOp_t act_op; if (op_type == "fusion_elementwise_add_activation") { auto mid_tensor = graph->GetNode(out_var_name + "_mid"); @@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { mid_tensor->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(act_op); + CNML_CALL(cnmlDestroyBaseOp(&act_op)); } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index e5087dd708eee3ba255fbfa0383d31b12a6b6870..7844e5b1b57567f72750b21ba288547cb165eb54 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector& x_shape, opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 286feec8d4d44eaa025f333d559c32ca72f042ff..e820fc7abca89a573cfbd7efd7ecca1640905e6a 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto w_var_name = op_info->Input("W").front(); auto output_var_name = op_info->Output("Out").front(); - // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + CHECK(!op_info->HasAttr("activation_type")); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); auto output = scope->FindVar(output_var_name)->GetMutable(); @@ -45,11 +45,30 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(w_dims.size(), 2UL); // Create w node - std::vector w_shape{w_dims[1], w_dims[0]}; + std::vector cnml_w_shape; + if (x_dims.size() == 4) { + if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) { + cnml_w_shape = { + static_cast(w_dims[1]), + static_cast(x_dims[1]), // input_c + static_cast(x_dims[2]), // input_h + static_cast(x_dims[3]), // input_w + }; + } 
else { + LOG(FATAL) + << "in fc op, we expect input_h * input_w * input_c == filter_c" + << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2] + << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0] + << std::endl; + } + } else { + cnml_w_shape = {w_dims[1], w_dims[0]}; + } + auto w_tensor = graph->AddNode( - w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); + w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); - auto input_scale = op_info->GetAttr("input_scale"); + auto input_scale = op_info->GetInputScale(x_var_name)[0]; auto output_tensor = graph->AddNode(output_var_name, output->dims().Vectorize(), @@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (HasInputArg(op_info, scope, "Bias")) { bias_var_name = op_info->Input("Bias").front(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); + auto bias_dims = bias->dims().Vectorize(); CHECK(!graph->HasNode(bias_var_name)); + if (bias_dims.size() < 4u) { + bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1); + } // CHECK_EQ(bias_dims.production(), n); - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } cnmlBaseOp_t fc_op; @@ -82,24 +101,52 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); graph->SetComputingDataType( fc_op, graph->GetNode(x_var_name)->mlu_tensor(), 1 / input_scale); - auto weight_scale = op_info->GetAttr>("weight_scale"); + auto weight_scale = op_info->GetInputScale(w_var_name); // LOG(INFO) << "W precision " << int(w->precision()); if (w->precision() == PrecisionType::kUnk || w->precision() == PrecisionType::kInt8) { std::vector w_dequant(w->data_size()); - dequant(w_dequant.data(), - w->mutable_data(), - 1, - w_dims[1], - w_dims[0], - weight_scale); - for (int i = 0; i < w_dims[1]; i++) { - for (int j = 0; j < w_dims[0]; j++) { - w->mutable_data()[i * w_dims[0] + j] = - w_dequant[i + j * w_dims[1]]; - } + if (cnml_w_shape.size() == 2) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1], + weight_scale); + transpose2d(w_dequant.data(), + w->mutable_data(), + {static_cast(cnml_w_shape[0]), + static_cast(cnml_w_shape[1])}); + } else if (cnml_w_shape.size() == 4) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3], + weight_scale); + + int c_o_num = cnml_w_shape[0]; + int c_i_num = cnml_w_shape[1]; + int h_i_num = cnml_w_shape[2]; + int w_i_num = cnml_w_shape[3]; + + // chw == ci * hi * wi == w_dim[0] + // first trans [chw, co] -> [co,chw] + std::vector first_trans_output(w_dequant.size()); + int chw = c_i_num * h_i_num * w_i_num; + transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num}); + + // second trans [co,ci,hi,wi] -> [co,hi,wi,ci] + transpose(first_trans_output.data(), + w->mutable_data(), + {c_o_num, c_i_num, h_i_num, w_i_num}, + {0, 2, 3, 1}); + } else { + LOG(FATAL) << "expect w_shape.size == 2 or 4, but got " + << cnml_w_shape.size() << std::endl; } + w->set_precision(PrecisionType::kFloat); } else if (w->precision() != PrecisionType::kFloat) { LOG(FATAL) << "UnSupported weight precision!"; @@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { 
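For a 4-D input the fc bridge above reshapes the weight to {co, ci, hi, wi} and then rearranges the dequantized data in two passes: [chw, co] -> [co, chw], followed by [co, ci, hi, wi] -> [co, hi, wi, ci]. A standalone index-level sketch of the second permutation (the first is an ordinary 2-D transpose); the helper name is made up and does not reuse the patch's transpose utilities.

// Editor's sketch (not part of the patch): permute a {co, ci, hi, wi} buffer
// into {co, hi, wi, ci}, the layout the MLU fc kernel expects for its weight.
#include <vector>

std::vector<float> ToCoHiWiCi(const std::vector<float>& src,
                              int co, int ci, int hi, int wi) {
  std::vector<float> dst(src.size());
  for (int o = 0; o < co; ++o)
    for (int c = 0; c < ci; ++c)
      for (int h = 0; h < hi; ++h)
        for (int w = 0; w < wi; ++w)
          dst[((o * hi + h) * wi + w) * ci + c] =
              src[((o * ci + c) * hi + h) * wi + w];
  return dst;
}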
graph->SetComputingDataType( fc_op, w_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + 1 / *max_element(weight_scale.begin(), weight_scale.end())); graph->FuseOp(fc_op); + CNML_CALL(cnmlDestroyBaseOp(&fc_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index fe1c889f431350b4175ac400aefe77e6392405c5..b7c576581b7bab4b5dd3f2538350a65f94d62c62 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -131,14 +131,15 @@ void test_fc(const std::vector& input_shape, fc_op_desc_mlu.SetOutput("Out", {out_var_name}); fc_op_desc_mlu.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); - fc_op_desc_mlu.SetAttr("weight_scale", - std::vector(w_shape[1], w_scale)); - fc_op_desc_mlu.SetAttr("input_scale", input_scale); + OpInfo op_info(fc_op_desc_mlu); + op_info.SetInputScale(w_int_var_name, + std::vector(w_shape[1], w_scale)); + op_info.SetInputScale(input_var_name, {input_scale}); if (has_bias) { - fc_op_desc_mlu.SetInput("Bias", {bias_var_name}); + op_info.SetInput("Bias", {bias_var_name}); } - auto fc_op_mlu = CreateOp(fc_op_desc_mlu, &scope); + auto fc_op_mlu = CreateOp(op_info, &scope); Tensor input_tmp, out_tmp; input_tmp.Resize(input_shape); @@ -175,9 +176,9 @@ void test_fc(const std::vector& input_shape, TEST(MLUBridges, fc) { for (bool use_bias : {true, false}) { - // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); - // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); - // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); } } diff --git a/lite/kernels/mlu/bridges/flatten_op.cc b/lite/kernels/mlu/bridges/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..faf7e6fd2801cdcaad4bce0a20921843f1d1b516 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
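The conv and fc test updates above stop writing weight_scale/input_scale attributes and attach per-tensor scales to the OpInfo instead, which the converters read back via GetInputScale. A hedged distillation of that pattern; it only builds inside the Paddle-Lite tree, and the function and variable names are made up for illustration.

// Editor's sketch (not part of the patch): per-tensor quantization scales now
// travel on the OpInfo, keyed by input variable name.
#include <string>
#include <vector>
#include "lite/core/op_lite.h"

void AttachQuantScales(paddle::lite::OpInfo* op_info,
                       const std::string& w_var_name, float w_scale, int oc,
                       const std::string& x_var_name, float x_scale) {
  // Per-channel scale for the quantized weight, per-tensor scale for the input;
  // converters read these back with GetInputScale(var_name).
  op_info->SetInputScale(w_var_name, std::vector<float>(oc, w_scale));
  op_info->SetInputScale(x_var_name, {x_scale});
}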
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FlattenConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Flatten op =================================== + cnmlBaseOp_t flatten_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + cnmlReshapeOpParam_t reshape_param{nullptr}; + CNML_CALL(cnmlCreateNdReshapeOpParam( + &reshape_param, cnml_trans2_input_shape, output->dims().size())); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&flatten_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Flatten End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // ============== DEBUG LOG =============== + + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ============== DEBUG END =============== + graph->FuseOp(trans1_op); + graph->FuseOp(flatten_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&flatten_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(flatten, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); +REGISTER_SUBGRAPH_BRIDGE(flatten2, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); diff --git a/lite/kernels/mlu/bridges/flatten_op_test.cc b/lite/kernels/mlu/bridges/flatten_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..190b837ffeecfd494ffbd748220207cd63da5c06 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
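The flatten bridge above transposes the NHWC buffer to NCHW, applies the reshape, then transposes back to NHWC. For reference, the 2-D shape flatten/flatten2 produce from an input shape and axis, which is what that reshape targets; the function name is illustrative only.

// Editor's sketch (not part of the patch): the 2-D output shape of
// flatten/flatten2 for a given split axis.
#include <cstdint>
#include <vector>

std::vector<int64_t> FlattenShape(const std::vector<int64_t>& dims, int axis) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) inner *= dims[i];
  return {outer, inner};  // e.g. {1, 2, 4, 4} with axis = 2 -> {2, 16}
}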
+ +#include "lite/operators/flatten_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_flatten(std::vector input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("flatten2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + auto op = CreateOp(opdesc, &scope); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, flatten) { test_flatten({1, 2, 4, 4}, 2); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); diff --git a/lite/kernels/mlu/bridges/gather_op.cc b/lite/kernels/mlu/bridges/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b68f1af76456eede14ec550c623d6a8355f5d5e8 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto index_var_name = op_info->Input("Index").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + auto index_tensor = graph->GetNode(index_var_name); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t gather_op; + CNML_CALL(cnmlCreateGatherV2Op(&gather_op, + x_tensor->mlu_tensor(), + index_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + CNML_DIM_N)); + graph->FuseOp(gather_op); + CNML_CALL(cnmlDestroyBaseOp(&gather_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(gather, + kMLU, + paddle::lite::subgraph::mlu::GatherConverter); diff --git a/lite/kernels/mlu/bridges/gather_op_test.cc b/lite/kernels/mlu/bridges/gather_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..413de7c9d7fda750b387c2daa21ef1e40e7982c7 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/gather_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void gather_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto index = + scope->FindVar(op_info->Input("Index").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_dims = x->dims(); + auto index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + + int batch_size = index_dims[0]; + DDim out_dims = x_dims; + out_dims[0] = batch_size; + out->Resize(out_dims); + + auto x_data = x->data(); + auto index_data = index->data(); + auto out_data = out->mutable_data(); + + auto slice_num = x_dims[0]; + auto slice_size = x_dims.Slice(1, x_dims.size()).production(); + for (int i = 0; i < batch_size; i++) { + auto index = index_data[i]; + CHECK_LT(index, slice_num) << "index <= slice_num"; + CHECK_GE(index, 0) << "index > 0"; + memcpy(out_data + i * slice_size, + x_data + index * slice_size, + slice_size * sizeof(float)); + } +} + +void test_gather() { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + std::string index_var_name = "index"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + auto* index = scope.Var(index_var_name)->GetMutable(); + + x->Resize({5, 4, 3, 2}); + index->Resize({2}); + // initialize input&output data + FillTensor(x); + FillTensor(index, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("gather"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Index", {index_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + gather_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input; + input.Resize({5, 4, 3, 2}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(5), + static_cast(4), + static_cast(3), + static_cast(2)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name, index_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output; + output.Resize(out->dims()); + transpose(out_data, + output.mutable_data(), + {static_cast(out->dims()[0]), + static_cast(out->dims()[2]), + static_cast(out->dims()[3]), + static_cast(out->dims()[1])}, + {0, 3, 1, 2}); + out_data = output.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, gather) { test_gather(); } + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(gather, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 65c2f8214c13ee8d004dbe4b2e706523d007469c..bbe88547c8d60e1468653a28dad97af09b24f952 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -27,10 +27,14 @@ 
std::shared_ptr Graph::AddNode(const std::string& name, cnmlTensorType_t tensor_type, cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order, void* raw_ptr) { CHECK(!HasNode(name)); + VLOG(5) << "add mlu node: " << name << "\t data type " + << static_cast(mlu_dtype) << "\t data order " + << static_cast(data_order); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 2c6bd63a87e53332a329d0c5c66fcf372a2584ca..07c6b20efb9a72106cf6ae288c411e490345b089 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -15,13 +15,15 @@ #pragma once #include -#include #include #include +#include #include + #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#include "lite/utils/env.h" #define PRINT_HW_TIME false @@ -45,32 +47,30 @@ class Graph { CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); #endif } - ~Graph() { FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); - for (auto op : ops_) { - CNML_CALL(cnmlDestroyBaseOp(&op)); - } #if PRINT_HW_TIME CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); double total_time = 0; - for (auto& f : time_log_) { - total_time += f; + if (!time_log_.empty()) { + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; } - std::cout << "cnml hardware time for " << time_log_.size() - << " process:" << total_time / time_log_.size() << std::endl; #endif } - // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataOrder_t shape_order = CNML_NCHW, cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC, void* raw_ptr = nullptr); std::shared_ptr GetNode(const std::string& name) { @@ -82,9 +82,16 @@ class Graph { return nodes_.find(name) != nodes_.end(); } - void AddInput(std::shared_ptr tensor) { + void AddInput(std::shared_ptr tensor, + bool disable_batch_size_changeable = true) { inputs_.push_back(tensor->mlu_tensor()); input_tensors_.push_back(tensor); + if (!disable_batch_size_changeable) { + constexpr int input_dimNb = 4; + bool input_dim_mutable[4] = {true, false, false, false}; + CNML_CALL(cnmlSetTensorDimMutable( + tensor->mlu_tensor(), input_dim_mutable, input_dimNb)); + } } void AddOutput(std::shared_ptr tensor) { @@ -92,6 +99,22 @@ class Graph { output_tensors_.push_back(tensor); } + std::vector>* MutableInputs() { + return &input_tensors_; + } + + std::vector>* MutableOutputs() { + return &output_tensors_; + } + void GenOfflineModel(const std::string& name) { + cnmlModel_t model; + const std::string& symbol = "subnet0"; + const auto& filename = name + ".offline.cambricon"; + CNML_CALL(cnmlCreateModel(&model, filename.c_str())); + CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str())); + CNML_CALL(cnmlSaveModel(model, filename.c_str())); + CNML_CALL(cnmlDestroyModel(model)); + } void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); } void Compile(cnmlCoreVersion_t core_version, int core_number) { @@ -103,18 +126,37 @@ class Graph { 
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number)); CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version)); CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_)); - for (auto in : input_tensors_) { - input_addrs_.push_back(in->mlu_data()); - } - for (auto out : output_tensors_) { - output_addrs_.push_back(out->mlu_data()); - } } +#define MEASURE_HWTIME_START(que) \ + do { \ + CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \ + } while (0) + +#define MEASURE_HWTIME_END(que) \ + do { \ + thread_local float hw_time; \ + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \ + CNRT_CALL(cnrtSyncQueue(que)); \ + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \ + hw_time /= 1000.0f; \ + DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \ + std::lock_guard lk(time_mut_); \ + time_log_.push_back(hw_time); \ + } while (0) + void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + input_addrs_.resize(input_tensors_.size()); + output_addrs_.resize(output_tensors_.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = input_tensors_[i]->mlu_data(); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = output_tensors_[i]->mlu_data(); + } + #if PRINT_HW_TIME - thread_local float hw_time; - CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); + MEASURE_HWTIME_START(que); #endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), @@ -124,18 +166,46 @@ class Graph { &forward_param, que)); #if PRINT_HW_TIME - CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); + MEASURE_HWTIME_END(que); #endif + } - CNRT_CALL(cnrtSyncQueue(que)); + void Compute(cnrtQueue_t que, + const std::vector>& in, + const std::vector>& out) { + std::vector in_tensor; + std::vector out_tensor; + input_addrs_.resize(in.size()); + output_addrs_.resize(out.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = in[i]->mlu_data(); + in_tensor.push_back(in[i]->mlu_tensor()); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = out[i]->mlu_data(); + out_tensor.push_back(out[i]->mlu_tensor()); + } + +#if PRINT_HW_TIME + MEASURE_HWTIME_START(que); +#endif + /* Because of using cnmlSetTensorDimMutable, cnmlComputeFusionOpForward_V3 + * -> cnmlComputeFusionOpForward_V4 */ + CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_, + &in_tensor[0], + input_addrs_.data(), + input_addrs_.size(), + &out_tensor[0], + output_addrs_.data(), + output_addrs_.size(), + que, + NULL)); #if PRINT_HW_TIME - CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); - hw_time /= 1000.0f; - DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; - std::lock_guard lk(time_mut_); - time_log_.push_back(hw_time); + MEASURE_HWTIME_END(que); #endif } +#undef MEASURE_HWTIME_START +#undef MEASURE_HWTIME_END template void* RegisterConstData(size_t len) { @@ -165,7 +235,7 @@ class Graph { CNML_CALL(cnmlBindConstData_V2( nodes_[tensor_name]->mlu_tensor(), alloc_data, false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len); + void* data_fp16 = RegisterConstData(len); CNRT_CALL( cnrtCastDataType(const_cast(static_cast(data)), CNRT_FLOAT32, @@ -180,7 +250,7 @@ class Graph { } } - void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { + void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) { const float* data = tensor->data(); size_t len = 
tensor->data_size(); if (fp_type_ == CNML_DATA_FLOAT32) { @@ -189,10 +259,14 @@ class Graph { const_cast(static_cast(data)), false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>(); - for (size_t i = 0; i < len; ++i) { - data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]); - } + void* data_fp16 = RegisterConstData(len); + CNRT_CALL( + cnrtCastDataType(const_cast(static_cast(data)), + CNRT_FLOAT32, + data_fp16, + CNRT_FLOAT16, + len, + nullptr)); CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(), static_cast(data_fp16), false)); @@ -206,19 +280,23 @@ class Graph { float scale, cnmlDataType_t data_type = CNML_DATA_INT8) { cnmlQuantizedParam_t quant_param; - CNML_CALL( - cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0)); + int pos = scale2position(scale); + auto cnml_scale = pow(2, pos) * scale; + VLOG(5) << "[cnml quantized param] pos: " << pos + << "\tscale: " << cnml_scale << std::endl; + CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0)); CNML_CALL( cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param)); CNML_CALL(cnmlDestroyQuantizedParam(&quant_param)); } - void SetFPType(::paddle::lite_api::PrecisionType type) { + void SetFPType(paddle::lite_api::PrecisionType type) { + origin_fp_type_ = type; switch (type) { - case ::paddle::lite_api::PrecisionType::kFP16: + case paddle::lite_api::PrecisionType::kFP16: fp_type_ = CNML_DATA_FLOAT16; break; - case ::paddle::lite_api::PrecisionType::kFloat: + case paddle::lite_api::PrecisionType::kFloat: fp_type_ = CNML_DATA_FLOAT32; break; default: @@ -230,14 +308,14 @@ class Graph { private: cnmlDataType_t fp_type_{CNML_DATA_FLOAT32}; - std::map> nodes_; + paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)}; + std::unordered_map> nodes_; std::vector inputs_; std::vector outputs_; std::vector input_addrs_; std::vector output_addrs_; std::vector> input_tensors_; std::vector> output_tensors_; - std::vector ops_; cnmlFusionOp_t fusion_op_; std::vector const_data_storage_; #if PRINT_HW_TIME diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc index 2c1a2aeeff799d31d4328169fce058259543fb1f..32840736b8d9a9712d59a8175cd7d70311a34aad 100644 --- a/lite/kernels/mlu/bridges/interpolate_op.cc +++ b/lite/kernels/mlu/bridges/interpolate_op.cc @@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { nn_param)); CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); graph->FuseOp(interp_op); + CNML_CALL(cnmlDestroyBaseOp(&interp_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/layout_op.cc b/lite/kernels/mlu/bridges/layout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d14695c4357e06832e06a68646628bfa8d211c43 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("Input").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + std::shared_ptr output_tensor; + + CHECK(graph->HasNode(x_var_name)); + std::vector axis; + auto x_tensor = graph->GetNode(x_var_name); + auto x_data_order = x_tensor->dorder(); + auto x_dims = x->dims().Vectorize(); + if (x_data_order == CNML_NCHW) { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 2, 3, 1}; + break; + case 5: + axis = {0, 2, 3, 4, 1}; + break; + default: + CHECK(0) << "Unsupport shape"; + } + output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype()); + VLOG(3) << "layout transpose nchw to nhwc" << std::endl; + } else { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 3, 1, 2}; + break; + case 5: + axis = {0, 4, 1, 2, 3}; + break; + default: + CHECK(0) << "Unsupport shpae"; + } + VLOG(3) << "layout transpose nhwc to nchw" << std::endl; + output_tensor = graph->AddNode(out_var_name, + output_dims, + CNML_TENSOR, + CNML_NCHW, + x_tensor->dtype(), + CNML_NCHW); + } + cnmlBaseOp_t layout_op; + cnmlNdTransposeOpParam_t transpose_param; + CNML_CALL( + cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + graph->FuseOp(layout_op); + CNML_CALL(cnmlDestroyBaseOp(&layout_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(layout, + kMLU, + paddle::lite::subgraph::mlu::LayoutConverter); diff --git a/lite/kernels/mlu/bridges/layout_op_test.cc b/lite/kernels/mlu/bridges/layout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..69b905b0750fe99e29c6aaa9bffdc9f20229a239 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op_test.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
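+
+// Note on the converter above: LayoutConverter implements paddle's layout op
+// as an explicit cnmlCreateNdTransposeProOp. The direction is taken from the
+// data order recorded on the input node: a CNML_NCHW input is permuted to
+// NHWC, otherwise the inverse permutation is applied and the output node is
+// registered with CNML_NCHW order. For the common 4-D case the two
+// permutations used in the converter are simply:
+//   std::vector<int> nchw2nhwc = {0, 2, 3, 1};  // applied when the input is NCHW
+//   std::vector<int> nhwc2nchw = {0, 3, 1, 2};  // applied when the input is NHWC
+// The tests below exercise both directions for 2-D through 5-D shapes.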
+ +#include "lite/operators/layout_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_layout_NHWC2NCHW(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[1])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}, + {0, 3, 1, 2}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4]), + static_cast(input_shape[1])}, + {0, 4, 1, 2, 3}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +void test_layout_NCHW2NHWC(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + 
static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4])}, + {0, 2, 3, 4, 1}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +TEST(MLUBridges, layout) { + test_layout_NHWC2NCHW({12, 32, 4}); + test_layout_NHWC2NCHW({12, 32, 44, 3}); + test_layout_NHWC2NCHW({12, 32, 44, 3, 6}); + test_layout_NCHW2NHWC({12, 32, 55}); + test_layout_NCHW2NHWC({12, 32, 44, 3}); + test_layout_NCHW2NHWC({12, 32, 44, 3, 8}); + test_layout_NHWC2NCHW({12, 32}); + test_layout_NCHW2NHWC({12, 32}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(layout, kMLU); diff --git a/lite/kernels/mlu/bridges/lrn_op.cc b/lite/kernels/mlu/bridges/lrn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff428ab10cef170983de788b9af517558e1fd7f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
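+
+// Bridge for paddle's lrn op. The converter below reads alpha, beta, k and
+// the window size n from the op descriptor, accepts only the AcrossChannels
+// norm_region, and builds a cnmlCreateLrnOp (CNML_LRN_V3). The input side is
+// quantized via SetComputingDataType() with 1 / input_scale (INT8 by
+// default), while the output is kept at the graph FP type.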
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LrnConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create lrn node and get params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + if (op_info->HasAttr("norm_region")) { + CHECK(op_info->GetAttr("norm_region") == "AcrossChannels") + << "Unsuport WithinChannel"; + } + auto local_size = op_info->GetAttr("n"); + auto input_scale = op_info->GetInputScale(x_var_name)[0]; + VLOG(5) << "lrn input scale: " << input_scale; + + cnmlLrnOpParam_t param; + cnmlBaseOp_t lrn_op; + CNML_CALL( + cnmlCreateLrnOpParam(¶m, CNML_LRN_V3, local_size, alpha, beta, k)); + CNML_CALL(cnmlCreateLrnOp( + &lrn_op, param, input_tensor->mlu_tensor(), output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyLrnOpParam(¶m)); + + graph->SetComputingDataType( + lrn_op, input_tensor->mlu_tensor(), 1 / input_scale); + CNML_CALL(cnmlSetOperationComputingDataType( + lrn_op, output_tensor->mlu_tensor(), fp_type, nullptr)); + + graph->FuseOp(lrn_op); + CNML_CALL(cnmlDestroyBaseOp(&lrn_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(lrn, kMLU, paddle::lite::subgraph::mlu::LrnConverter); diff --git a/lite/kernels/mlu/bridges/lrn_op_test.cc b/lite/kernels/mlu/bridges/lrn_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..266446d6d3353bffa4398385703cd4cb64b4f53b --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op_test.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/lrn_op.h" +#include +#include +#include +#include +#include + +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +/** + * @brief get sum of x^2 between channels [size elements] + * + * @tparam float + * @param input + * @param channel_id: the c-th channel within n-th graph. 
+ * @param offset_within_channel: the pixel's offset within a channel. + * @param offset_num: the first address of n-th graph. + * @param c + * @param h + * @param w + * @param size + * @return float + */ +float lrn_square(const float* input, + int channel_id, + int offset_within_channel, + int offset_num, + int c, + int h, + int w, + int size) { + int pre_pad = (size - 1) / 2; + float res = 0; + const float* src = input + offset_num; + + // handle left channels with padding situation. + if (channel_id - pre_pad < 0) { + for (int i = 0; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle left channels. + if (channel_id - pre_pad >= 0) { + for (int i = channel_id - pre_pad; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels. + if (channel_id + pre_pad < c) { + for (int i = channel_id + 1; i <= channel_id + pre_pad; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels with padding situation. + if (channel_id + pre_pad >= c && channel_id + 1 < c) { + for (int i = channel_id + 1; i < c; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + return res; +} + +void lrn_compute_ref(std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = scope->FindVar(op_info->Output("Out").front()) + ->GetMutable(); + + const float* x_data = x->data(); + float* out_data = out->mutable_data(); + auto x_dims = x->dims(); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + auto norm_region = op_info->GetAttr("norm_region"); + auto local_size = op_info->GetAttr("n"); + + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int offset_num = 0; + int offset_within_channel = 0; + int dst_id; + + float square; + + for (int n = 0; n < N; ++n) { + offset_num = n * C * H * W; + + for (int c = 0; c < C; ++c) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + offset_within_channel = h * W + w; + dst_id = offset_num + c * H * W + offset_within_channel; + square = lrn_square(x_data, + c, + offset_within_channel, + offset_num, + C, + H, + W, + local_size); + out_data[dst_id] = x_data[dst_id] * pow(k + alpha * square, -beta); + } + } + } + } +} + +void test_lrn(float alpha, + float beta, + float k, + int local_size, + int n, + int c, + int h, + int w, + const std::string& norm_region) { + Scope scope; + std::string x_var_name("X_test"); + std::string out_var_name("Out_test"); + std::string out_ref_var_name("Out_ref"); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + + std::vector x_dim{n, c, h, w}; + x->Resize(x_dim); + out->Resize(x_dim); + out_ref->Resize(x_dim); + auto* x_data = x->mutable_data(); + FillTensor(x, 0.f, 1.f); + float *dmax, *dmin; + std::tie(dmin, dmax) = + std::minmax_element(x_data, x_data + x->data_size() - 1); + + cpp::OpDesc opdesc; + opdesc.SetType("lrn"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("alpha", alpha); + opdesc.SetAttr("beta", beta); + opdesc.SetAttr("k", k); + opdesc.SetAttr("n", 
local_size); + opdesc.SetAttr("norm_region", norm_region); + OpInfo op_info(opdesc); + op_info.SetInputScale(x_var_name, {(*dmax - *dmin) / 255.f}); + + auto op = CreateOp(op_info, &scope); + + // baseline + lrn_compute_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(x_dim[0]), + static_cast(x_dim[1]), + static_cast(x_dim[2]), + static_cast(x_dim[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims(); + output_trans.Resize(os); + transpose(out->mutable_data(), + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + + auto output_data = output_trans.mutable_data(); + auto* output_ref_data = out_ref->mutable_data(); + for (size_t i = 0; i < out->data_size(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, lrn) { + int local_size = 5; + float alpha = 0.0001f; + float beta = 0.75; + float k = 2.0f; + std::string norm_region = "AcrossChannels"; + for (int w : {2, 4, 8}) { + for (int h : {2, 4, 8}) { + for (int c : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + test_lrn(alpha, beta, k, local_size, n, c, h, w, norm_region); + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(lrn, kMLU) diff --git a/lite/kernels/mlu/bridges/norm_op.cc b/lite/kernels/mlu/bridges/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..492c3932a8c8a68f7eba687dde30d888d6e0f297 --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
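+
+// Bridge for paddle's norm op (L2 normalization along a single axis). The
+// converter below translates the axis attribute into the mode argument of
+// cnmlCreateNormalizeOpParamV2 (N/C/H/W -> 3/0/4/5) and emits a
+// cnmlCreateNormalizeOp without a scale tensor; the CPU reference and the
+// layout-transposing test live in norm_op_test.cc.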
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + std::vector nchw2nhwc = {0, 3, 1, 2}; + int nhwc_axis = nchw2nhwc[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + // ======== DEBUG =============== + VLOG(6) << "x name=" << x_var_name; + VLOG(6) << "out name=" << out_var_name; + VLOG(6) << "x dims=" << x->dims(); + VLOG(6) << "out dims=" << output->dims(); + VLOG(6) << "axis =" << axis; + VLOG(6) << "nwhc axis=" << nhwc_axis; + VLOG(6) << "epsilon =" << epsilon; + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======== DEBUG END ============ + cnmlBaseOp_t norm_op{nullptr}; + + cnmlNormalizeOpParam_t param; + int mode = -1; + switch (axis) { + case 0: + mode = 3; // N + break; + case 1: + mode = 0; // C + break; + case 2: + mode = 4; // H + break; + case 3: + mode = 5; // W + break; + default: + CHECK(0); + break; + } + cnmlCreateNormalizeOpParamV2(¶m, + 0, // p + 0, // use_scale + mode, + 1, // weight + epsilon); + + CNML_CALL(cnmlCreateNormalizeOp(&norm_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nullptr, + false /*is_fix8_mode*/)); + graph->FuseOp(norm_op); + CNML_CALL(cnmlDestroyBaseOp(&norm_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(norm, + kMLU, + paddle::lite::subgraph::mlu::NormConverter); diff --git a/lite/kernels/mlu/bridges/norm_op_test.cc b/lite/kernels/mlu/bridges/norm_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35b5eabbb9ffacd96c3ca6500dd9181f4d5bec5b --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/norm_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// void ToFile(std::string file_name, Tensor* tensor) { +// int count = tensor->dims().production(); +// auto data = tensor->mutable_data(); +// std::ostringstream outs; +// for (size_t i = 0; i < count; i++) { +// outs << data[i] << std::endl; +// } +// std::ofstream of; +// of.open(file_name, std::ios::out); +// of << outs.str(); +// of.close(); +// } + +void norm_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + out->Resize(x_dims.Vectorize()); + auto* out_data = out->mutable_data(); + + const auto* x_data = x->data(); + int pre_n = x_dims.count(0, axis); + int n = x_dims[axis]; + int post_n = x_dims.count(axis + 1, x_dims.size()); + for (int i = 0; i < pre_n; i++) { + for (int k = 0; k < post_n; k++) { + float sum = epsilon; + const float* in_tmp = x_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + sum += in_tmp[j * post_n] * in_tmp[j * post_n]; + } + sum = std::sqrt(sum); + float* out_tmp = out_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + out_tmp[j * post_n] = in_tmp[j * post_n] / sum; + } + } + } +} + +void test_norm(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + float epsilon = 1e-9f; + opdesc.SetType("norm"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + opdesc.SetAttr("epsilon", static_cast(epsilon)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + norm_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], 
out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, norm) { + test_norm({1, 2, 3, 4}, 1); + test_norm({1, 2, 3, 4}, 2); + test_norm({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(norm, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index d31ba0dd41111860a3b26d8ac3afb3273bef4557..be5c64b3b7056d0b8de1589d198db541b5a3777b 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -15,6 +15,7 @@ #pragma once USE_SUBGRAPH_BRIDGE(relu, kMLU); +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(conv2d, kMLU); USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU); USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU); @@ -24,5 +25,26 @@ USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); USE_SUBGRAPH_BRIDGE(concat, kMLU); USE_SUBGRAPH_BRIDGE(scale, kMLU); +USE_SUBGRAPH_BRIDGE(sigmoid, kMLU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU); +USE_SUBGRAPH_BRIDGE(dropout, kMLU); +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); +USE_SUBGRAPH_BRIDGE(split, kMLU); +USE_SUBGRAPH_BRIDGE(cast, kMLU); +USE_SUBGRAPH_BRIDGE(layout, kMLU); +USE_SUBGRAPH_BRIDGE(slice, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); +#ifdef LITE_BUILD_EXTRA +USE_SUBGRAPH_BRIDGE(gather, kMLU); +USE_SUBGRAPH_BRIDGE(lrn, kMLU) +USE_SUBGRAPH_BRIDGE(norm, kMLU) +#endif diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index f77c8084c76fc52c39938e723f02bde9b3cac41b..c734de1eec75d253a9b6b8d7a7f21d710df3d949 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto global_pooling = op_info->GetAttr("global_pooling"); auto ksize = op_info->GetAttr>("ksize"); auto strides = op_info->GetAttr>("strides"); + CHECK(!(op_info->HasAttr("exclusive") && + op_info->GetAttr("exclusive") == false)) + << "Unsupport param exclusive is false!"; if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { @@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } - int pad_height = paddings[0]; - int pad_width = paddings[2]; std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { padding_algorithm = op_info->GetAttr("padding_algorithm"); @@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (op_info->HasAttr("adaptive")) { adaptive = op_info->GetAttr("adaptive"); } + auto input_dims = x->dims(); + lite::operators::UpdatePadding(&paddings, global_pooling, adaptive, @@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - // std::vector output_shape({input_dims[0], input_dims[1]}); - // for (size_t i = 0; i < 2; i++) { - // output_shape.push_back( - // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - - // ksize[0]) / - // strides[i] + - // 1); - // } + if (global_pooling) { + ksize.resize(static_cast(input_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + ksize[i] = 
static_cast(input_dims[i + 2]); + } + } auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( - cnmlCreatePoolOpParam_V2(&pool_param, + cnmlCreatePoolOpParam_V3(&pool_param, ksize[0], ksize[1], strides[0], strides[1], - pad_height, - pad_width, - 1, // dilation - 1, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + 1, // dilation h + 1, // dilation w ToCnmlPoolMode(pooling_type), - ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL, + ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID, true, /* real */ 1 /* blend factor */)); cnmlBaseOp_t pool_op; @@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); graph->FuseOp(pool_op); + CNML_CALL(cnmlDestroyBaseOp(&pool_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 8cee8dbe86109b14cff49f329d71074a9b3bfb61..2ae888744fde3e94e857f04d50ceb1eb878f3c1c 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr op) { std::string pooling_type = op_info->GetAttr("pooling_type"); bool global_pooling = op_info->GetAttr("global_pooling"); + if (pooling_type == "max") { + for (int i = 0; i < out_dims.production(); ++i) { + dst_ptr[i] = -65504.f; + } + } + int in_n = in_dims[0]; int in_c = in_dims[1]; int in_h = in_dims[2]; @@ -203,62 +209,46 @@ void test_pool(int bs, } TEST(MLUBridges, pool) { - // for (auto pooling_type : {"max", "avg"}) { - // for (auto ceil_mode : {true, false}) { - // for (auto global_pooling : {/*true, */ false}) { - // for (auto exclusive : {true /*, false*/}) { - // for (auto ksize : {2, 3}) { - // for (auto stride : {1, 2}) { - // for (auto padding : {0, 1}) { - // for (auto bs : {1, 3}) { - // for (auto ic : {1, 3}) { - // for (auto ih : {3, 7}) { - // for (auto iw : {3, 7}) { - // test_pool(bs, - // ic, - // ih, - // iw, - // pooling_type, - // ceil_mode, - // global_pooling, - // exclusive, - // ksize, - // stride, - // padding); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - for (auto pooling_type : {"max", "avg"}) { for (auto ceil_mode : {true, false}) { - bool global_pooling = false; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); + for (auto global_pooling : {true, false}) { + for (auto exclusive : {true /*, false*/}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 7}) { + for (auto iw : {3, 7}) { + LOG(INFO) + << "shape: " << bs << ',' << ic << ',' << ih << ',' + << iw << '\t' << "pooling type: " << pooling_type + << '\t' << "ceil model: " << ceil_mode << '\t' + << "global_pooling: " << global_pooling << '\t' + << "exclusive: " << exclusive << '\t' + << "ksize: " << ksize << '\t' + << "stride: " << stride << '\t' + << "padding: " << padding; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + global_pooling, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } } } } diff --git a/lite/kernels/mlu/bridges/reshape_op.cc 
b/lite/kernels/mlu/bridges/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b47322b3462525be64e42b608d052719d7c5f0b --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Reshape op =================================== + cnmlBaseOp_t reshape_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlReshapeOpParam_t reshape_param{nullptr}; + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + CNML_CALL( + cnmlCreateNdReshapeOpParam(&reshape_param, cnml_trans2_input_shape, 4)); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&reshape_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Reshape op End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // =============== DEBUG ==================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + int cnml_input_shape[4]; + CNML_CALL(cnmlGetTensorShape(input_tensor->mlu_tensor(), cnml_input_shape)); + VLOG(6) << "cnml input dim: "; + for (size_t i = 0; i < 4; i++) { + VLOG(6) << cnml_input_shape[i]; + } + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // =============== DEBUG END ================= + + graph->FuseOp(trans1_op); + graph->FuseOp(reshape_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&reshape_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(reshape, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); diff --git a/lite/kernels/mlu/bridges/reshape_op_test.cc b/lite/kernels/mlu/bridges/reshape_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd2c6cc26f8f40ee83c99755d8842b072693b1a --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
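+
+// Note on the converter above: the MLU graph keeps tensors in NHWC order
+// while paddle's reshape semantics are defined on NCHW data, so
+// ReshapeConverter brackets the cnmlCreateReshapeOp with two
+// cnmlCreateNdTransposeProOp nodes: transpose NHWC -> NCHW, reshape to the
+// target shape, then transpose back to NHWC. All three ops are fused into the
+// subgraph and destroyed after fusion. The test below reshapes a {1, 2, 4, 4}
+// tensor to {1, 4, 2, 4} and compares against the untouched CPU copy.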
+ +#include "lite/operators/reshape_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_reshape(std::vector input_shape, + std::vector out_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reshape2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + std::vector shape_attr; + shape_attr.resize(out_shape.size()); + for (size_t i = 0; i < out_shape.size(); i++) { + shape_attr[i] = static_cast(out_shape[i]); + } + + opdesc.SetAttr>("shape", shape_attr); + auto op = CreateOp(opdesc, &scope); + + auto os = out->dims(); + out->Resize(out_shape); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor out_trans; + out_trans.Resize(out_shape); + transpose(out->mutable_data(), + out_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[1]), + static_cast(out_shape[2]), + static_cast(out_shape[3])}, + {0, 3, 1, 2}); + out->CopyDataFrom(out_trans); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, reshape) { test_reshape({1, 2, 4, 4}, {1, 4, 2, 4}); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc index 5557602bd7576ccd71c51f52a538a45fe27f7ada..5b6b3dff7969562b19344f9eccbf219d26c3e02d 100644 --- a/lite/kernels/mlu/bridges/scale_op.cc +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { alpha_tensor->mlu_tensor(), beta_tensor->mlu_tensor())); graph->FuseOp(scale_op); + CNML_CALL(cnmlDestroyBaseOp(&scale_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/slice_op.cc b/lite/kernels/mlu/bridges/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..067d110bf4160c5bcf2bbd3009d82bbb5804c998 --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // input + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_shape = input->dims().Vectorize(); + // output + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + // attr + auto axes = op_info->GetAttr>("axes"); + auto starts = op_info->GetAttr>("starts"); + auto ends = op_info->GetAttr>("ends"); + + CHECK(graph->HasNode(input_var_name)); + auto input_tensor = graph->GetNode(input_var_name); + auto output_tensor = graph->AddNode(output_var_name, + output->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + std::vector begin_index(input_shape.size(), 0); + std::vector end_index(input_shape.size()); + std::vector strides(input_shape.size(), 1); + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(input_shape.size())); + for (size_t i = 0; i < input_shape.size(); ++i) { + end_index[nhwc2nchw_axis[i]] = input_shape[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = input_shape[axes[i]]; + int end = ends[i] < 0 ? std::max(ends[i] + dim_value, 0) : ends[i]; + begin_index[nhwc2nchw_axis[axes[i]]] = + starts[i] < 0 ? std::max(starts[i] + dim_value, 0) : starts[i]; + end_index[nhwc2nchw_axis[axes[i]]] = std::min(end, dim_value); + } + + cnmlNdStridedSliceOpParam_t param; + cnmlBaseOp_t slice_op; + CNML_CALL(cnmlCreateNdStridedSliceOpParam(¶m, + input_shape.size(), + begin_index.data(), + end_index.data(), + strides.data())); + CNML_CALL(cnmlCreateNdStridedSliceOp(&slice_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyNdStridedSliceOpParam(¶m)); + + graph->FuseOp(slice_op); + CNML_CALL(cnmlDestroyBaseOp(&slice_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(slice, + kMLU, + paddle::lite::subgraph::mlu::SliceConverter); diff --git a/lite/kernels/mlu/bridges/slice_op_test.cc b/lite/kernels/mlu/bridges/slice_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5e2a9f5a4c99b6f46fff24686cdbe546cae727d --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
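+
+// Note on the converter above: SliceConverter maps paddle's slice attributes
+// (axes / starts / ends, with negative indices counted from the end and ends
+// clamped to the dimension size) onto begin/end/stride vectors for
+// cnmlCreateNdStridedSliceOp, re-indexing each axis through GetAxisNHWC2NCHW
+// so the bounds match the NHWC-ordered MLU tensor. The test below uses
+// slice_ref() as the CPU reference and transposes data into and out of NHWC
+// around LaunchOp().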
+ +#include "lite/operators/slice_op.h" +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +static void slice_ref(const float* input, + std::vector in_dims, + std::vector axes, + std::vector starts, + std::vector ends, + float* out) { + auto out_dims = in_dims; + std::vector real_starts(in_dims.size(), 0); + std::vector real_ends(in_dims.size(), 0); + std::vector real_step(in_dims.size(), 0); + for (size_t i = 0; i < in_dims.size(); i++) { + real_ends[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = in_dims[axes[i]]; + if (dim_value > 0) { + int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + out_dims[axes[i]] = end - start; + real_starts[axes[i]] = start; + real_ends[axes[i]] = end; + } + } + const int LEN = in_dims.size(); + int dst_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + dst_step[i] = 1; + } + int src_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + src_step[i] = 1; + } + int out_num = out_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; i--) { + dst_step[i] = out_dims[i + 1] * dst_step[i + 1]; + src_step[i] = in_dims[i + 1] * src_step[i + 1]; + out_num *= out_dims[i]; + } + + for (int dst_id = 0; dst_id < out_num; dst_id++) { + int src_id = 0; + int index_id = dst_id; + for (size_t j = 0; j < out_dims.size(); j++) { + int cur_id = index_id / dst_step[j]; + index_id = index_id % dst_step[j]; + src_id += (cur_id + real_starts[j]) * src_step[j]; + } + out[dst_id] = input[src_id]; + } +} + +static void test_case(std::vector x_shape, + std::vector out_shape, + std::vector starts, + std::vector ends, + std::vector axes) { + Scope scope; + + std::string x_var_name = "x"; + std::string out_var_name = "out"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + x->Resize(lite::DDim(x_shape)); + out->Resize(lite::DDim(out_shape)); + + auto x_data = x->mutable_data(); + FillTensor(x, 0.f, 2.f); + + cpp::OpDesc opdesc; + opdesc.SetType("slice"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axes", axes); + opdesc.SetAttr("starts", starts); + opdesc.SetAttr("ends", ends); + + std::vector out_ref(out->data_size(), 0); + slice_ref(x_data, x_shape, axes, starts, ends, out_ref.data()); + + auto type_cast = [](int64_t in) { return static_cast(in); }; + std::vector i_dims; + std::transform( + x_shape.cbegin(), x_shape.cend(), std::back_inserter(i_dims), type_cast); + + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(x_shape.size())); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + i_dims, + nchw2nhwc_axis); + x->CopyDataFrom(input_x); + + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims().Vectorize(); + output_trans.Resize(os); + std::vector o_dims(os.size()); + for (size_t i = 0; i < os.size(); ++i) { + o_dims[i] = os[nchw2nhwc_axis[i]]; + } + transpose(out->mutable_data(), + output_trans.mutable_data(), + o_dims, + GetAxisNHWC2NCHW(x_shape.size())); + + auto out_data = output_trans.mutable_data(); + for 
(DDim::value_type i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_ref[i], out_data[i], 1e-4); + } +} + +TEST(MLUBridges, slice) { + /* test_case({3}, {3}, {-3}, {3}, {0}); */ + test_case({3, 4}, {3, 4}, {-3, 0}, {3, 100}, {0, 1}); + test_case({3, 4, 5}, {3, 4, 2}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + test_case({3, 4, 5, 6}, {3, 4, 2, 6}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + /* test_case({3, 4, 5, 6, 3}, {3, 4, 2, 6, 3}, {-3, 0, 2}, {3, 100, -1}, {0, + * 1, 2}); */ + /* test_case({3, 4, 5, 6, 5, 2}, {3, 4, 2, 6, 5, 2}, {-3, 0, 2}, {3, 100, 1}, + * {0, 1, 2}); */ +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(slice, kMLU); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc index 17c911675718a15c7ede4888b268ffcd62b4d8ed..b1b621c1efc6cbc54092a8082e4d624355e07652 100644 --- a/lite/kernels/mlu/bridges/softmax_op.cc +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); + auto x_shape = + scope->FindVar(x_var_name)->GetMutable()->dims().Vectorize(); - // nchw axis to nhwc aixs - int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + // nchw axis to nhwc axis int axis = 1; if (op_info->HasAttr("axis")) { axis = op_info->GetAttr("axis"); @@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis = output_dims.size() + axis; } } - int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(x_shape.size())[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->GetNode(x_var_name)->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(softmax_op); + CNML_CALL(cnmlDestroyBaseOp(&softmax_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc index a5251ed43c9187fc2874f9b01853b45b8abf7f1c..d5d7251205a0f60b9e5c8568a58ba48661c9df3e 100644 --- a/lite/kernels/mlu/bridges/softmax_op_test.cc +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -93,7 +93,7 @@ void test_softmax(const std::vector& input_shape, int axis) { opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor softmax_ref(op); diff --git a/lite/kernels/mlu/bridges/split_op.cc b/lite/kernels/mlu/bridges/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4188ba3ec08161552bc688c212408fa81ae815a3 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out"); + + auto param_axis = op_info->GetAttr("axis"); + + auto num = op_info->GetAttr("num"); + auto sections = op_info->GetAttr>("sections"); + int64_t sections_num = static_cast(sections.size()); + auto output_num = num > 0 ? num : sections_num; + + std::vector output_tensor; + for (auto out_name : out_var_name) { + auto out = scope->FindVar(out_name)->GetMutable(); + auto out_dims = out->dims().Vectorize(); + auto out_tensor = graph->AddNode( + out_name, out_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + output_tensor.push_back(out_tensor->mlu_tensor()); + } + + auto dims = x_dims.size(); + int axis = (param_axis < 0) ? (param_axis + dims) : param_axis; + CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + cnmlBaseOp_t split_op; + cnmlTensor_t inputs = input_tensor->mlu_tensor(); + CNML_CALL(cnmlCreateNdSplitOp( + &split_op, nhwc_axis, &inputs, 1, output_tensor.data(), output_num)); + graph->FuseOp(split_op); + CNML_CALL(cnmlDestroyBaseOp(&split_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(split, + kMLU, + paddle::lite::subgraph::mlu::SplitConverter); diff --git a/lite/kernels/mlu/bridges/split_op_test.cc b/lite/kernels/mlu/bridges/split_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a44a45504036e9ef6199e9d2b534aa3dde63bb01 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op_test.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
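The split converter above, like the softmax and slice bridges, remaps an NCHW axis index to its NHWC position with `GetAxisNHWC2NCHW`, which this patch defines later in `utility.h`. A minimal standalone sketch of that permutation, assuming the same construction as the patch's helper:

#include <cstdio>
#include <vector>

// Same construction as GetAxisNHWC2NCHW in utility.h: index by an NCHW axis,
// read off the corresponding NHWC axis.
std::vector<int> AxisNHWC2NCHW(size_t n) {
  std::vector<int> a(n);
  a[0] = 0;
  if (n > 1) a[1] = static_cast<int>(n) - 1;
  for (size_t i = 2; i < n; ++i) a[i] = static_cast<int>(i) - 1;
  return a;
}

int main() {
  auto m = AxisNHWC2NCHW(4);  // {0, 3, 1, 2}
  // An op acting on NCHW axis 1 (channels) acts on NHWC axis m[1] == 3,
  // which is what the softmax bridge computes for nhwc_axis.
  std::printf("axis 1 (NCHW) -> axis %d (NHWC)\n", m[1]);
  return 0;
}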
+ +#include "lite/operators/split_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void split_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + int num = op_info->GetAttr("num"); + int axis = op_info->GetAttr("axis"); + std::vector sections = op_info->GetAttr>("sections"); + std::vector output_vec; + auto output = op_info->Output("Out"); + for (auto out_var : output) { + output_vec.push_back(scope->Var(out_var)->GetMutable()); + } + auto in_dims = x->dims(); + auto rank = in_dims.size(); + int outs_number = output_vec.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + if (axis < 0) { + axis += rank; + } + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + for (int j = 0; j < outs_dims.size(); ++j) { + output_vec[j]->Resize(outs_dims[j]); + } + + const dtype* din = x->mutable_data(); + std::vector in_strides(in_dims.size()); + in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dims[i]; + } + + int input_offset = 0; + for (auto out : output_vec) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, + din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +void test_split(int bs, + int ic, + int ih, + int iw, + int axis, + int num, + std::vector sections) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name_1 = "out_1"; + std::string out_var_name_2 = "out_2"; + std::string out_ref_var_name_1 = "out_ref_1"; + std::string out_ref_var_name_2 = "out_ref_2"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); + auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); + auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); + auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("split"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); + opdesc.SetAttr("axis", axis); + opdesc.SetAttr("sections", sections); + opdesc.SetAttr("num", num); + + auto op = CreateOp(opdesc, &scope); + split_ref(op); + out_ref_1->CopyDataFrom(*out_1); + out_ref_2->CopyDataFrom(*out_2); + // execute reference 
implementation and save to output tensor + + Tensor input; + input.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(bs), + static_cast(ic), + static_cast(ih), + static_cast(iw)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); + + // compare results + auto* out_data_1 = out_1->mutable_data(); + auto* out_data_2 = out_2->mutable_data(); + auto* out_ref_data_1 = out_ref_1->mutable_data(); + auto* out_ref_data_2 = out_ref_2->mutable_data(); + + Tensor output1, output2; + output1.Resize(out_1->dims()); + output2.Resize(out_2->dims()); + transpose(out_data_1, + output1.mutable_data(), + {static_cast(out_1->dims()[0]), + static_cast(out_1->dims()[2]), + static_cast(out_1->dims()[3]), + static_cast(out_1->dims()[1])}, + {0, 3, 1, 2}); + transpose(out_data_2, + output2.mutable_data(), + {static_cast(out_2->dims()[0]), + static_cast(out_2->dims()[2]), + static_cast(out_2->dims()[3]), + static_cast(out_2->dims()[1])}, + {0, 3, 1, 2}); + out_data_1 = output1.mutable_data(); + out_data_2 = output2.mutable_data(); + for (int i = 0; i < out_1->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); + } + for (int i = 0; i < out_2->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); + } +} + +TEST(MLUBridges, split) { + test_split(4, 2, 3, 1, 0, 2, {}); + test_split(4, 2, 3, 1, 0, 0, {3, 1}); + test_split(4, 6, 3, 1, 1, 2, {}); + test_split(4, 6, 3, 1, 1, 0, {2, 4}); + test_split(4, 2, 2, 1, 2, 2, {}); + test_split(4, 2, 6, 1, 2, 0, {3, 3}); + test_split(4, 2, 3, 4, 3, 2, {}); + test_split(4, 2, 3, 6, 3, 0, {5, 1}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(split, kMLU); diff --git a/lite/kernels/mlu/bridges/squeeze_op.cc b/lite/kernels/mlu/bridges/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f8af5b014bdba29bb50036473f671ec359f26d4 --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
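The reference implementation above (`split_ref`) derives each output's shape either from `num` (equal parts along the split axis) or from explicit `sections`, after normalizing a negative axis. A small standalone sketch of that shape computation (the function name is illustrative):

#include <cassert>
#include <cstdio>
#include <vector>

// Compute per-output dims for a split along `axis`, either into `num`
// equal parts or according to explicit `sections`.
std::vector<std::vector<int>> SplitDims(std::vector<int> in_dims,
                                        int axis,
                                        int num,
                                        const std::vector<int>& sections) {
  if (axis < 0) axis += static_cast<int>(in_dims.size());
  std::vector<std::vector<int>> outs;
  if (num > 0) {
    assert(in_dims[axis] % num == 0);
    for (int i = 0; i < num; ++i) {
      auto d = in_dims;
      d[axis] = in_dims[axis] / num;
      outs.push_back(d);
    }
  } else {
    for (int s : sections) {
      auto d = in_dims;
      d[axis] = s;
      outs.push_back(d);
    }
  }
  return outs;
}

int main() {
  // {4, 6, 3, 1} split along axis 1 into sections {2, 4}, as in the test above.
  auto outs = SplitDims({4, 6, 3, 1}, 1, 0, {2, 4});
  for (const auto& d : outs)
    std::printf("%d %d %d %d\n", d[0], d[1], d[2], d[3]);
  return 0;
}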
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto output_dims_nhwc = DimNCHW2NHWC(output_dims); + std::vector o_dims(output_dims.size()); + std::transform(output_dims_nhwc.cbegin(), + output_dims_nhwc.cend(), + o_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlReshapeOpParam_t param; + cnmlBaseOp_t squeeze_op; + CNML_CALL(cnmlCreateNdReshapeOpParam(¶m, o_dims.data(), o_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze_op)); + + if (op_type == "squeeze2") { + auto xshape_var_name = op_info->Output("XShape").front(); + auto xshape = scope->FindVar(xshape_var_name)->GetMutable(); + auto dims_64 = xshape->dims().Vectorize(); + auto dims_64_nhwc = DimNCHW2NHWC(dims_64); + auto xshape_tensor = graph->AddNode( + xshape_var_name, dims_64, CNML_TENSOR, CNML_NCHW, fp_type); + + std::vector xshape_dims(dims_64.size()); + std::transform(dims_64_nhwc.cbegin(), + dims_64_nhwc.cend(), + xshape_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlBaseOp_t squeeze2_op; + CNML_CALL(cnmlCreateNdReshapeOpParam( + ¶m, xshape_dims.data(), xshape_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze2_op, + param, + input_tensor->mlu_tensor(), + xshape_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze2_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze2_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(squeeze, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); +REGISTER_SUBGRAPH_BRIDGE(squeeze2, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); diff --git a/lite/kernels/mlu/bridges/squeeze_op_test.cc b/lite/kernels/mlu/bridges/squeeze_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad16dac2e978fa977acacf62ed6adca16365ed6d --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/squeeze_op.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// squeeze +TEST(MLUBridges, squeeze) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // SqueezeCompute squeeze; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + std::vector axes{0, -2}; + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + auto x_data = out_ref->data(); + auto out_data = out->data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + } +} + +// squeeze2 +TEST(MLUBridges, squeeze2) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string xshape_var_name("xshape"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* xshape = scope.Var(xshape_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + std::vector xshape_shape({1, 3, 1, 5}); + xshape->Resize(xshape_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // Squeeze2Compute squeeze2; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("XShape", {xshape_var_name}); + + std::vector axes({0, -2}); + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name, xshape_var_name}); + + auto x_data = out_ref->mutable_data(); + auto out_data = out->mutable_data(); + auto xshape_data = xshape->mutable_data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + EXPECT_NEAR(xshape_data[j], x_data[j], 1e-5); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc index be7e1f09beaee61dace598b958ab4f95f14b38f8..f1bf48d66e8693e72a96f0f52c285a717f464128 100644 --- 
a/lite/kernels/mlu/bridges/tensor.cc +++ b/lite/kernels/mlu/bridges/tensor.cc @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include namespace paddle { @@ -25,8 +28,9 @@ namespace mlu { MLUTensor::MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, - cnmlDataType_t mlu_dtype) + cnmlDataOrder_t shape_order, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order) : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) { std::vector int_shape; for (auto i : shape) { @@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector& shape, LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!"; } } - remember(int_shape, tensor_type, mlu_dtype, data_order); + remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order); } void MLUTensor::remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order) { + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order) { tensor_type_ = tensor_type; mlu_dtype_ = mlu_dtype; + data_order_ = data_order; + origin_shape_.assign(shape.begin(), shape.end()); int size = 4; if (shape.size() > 4 || shape_order == CNML_ARRAY) { @@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector& shape, break; } } - dim_ = shape_.size(); + auto shape_NCHW = DimNHWC2NCHW(shape_); + shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end()); + dim_ = shape_NCHW.size(); + shape_ = DimNCHW2NHWC(shape_NCHW); } void MLUTensor::Create() { if (mlu_tensor_ == nullptr) { CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_)); std::vector dim_shape(shape_); + if (data_order_ == CNML_NCHW) { + std::transform(origin_shape_.cbegin(), + origin_shape_.cend(), + dim_shape.begin(), + [](DDim::value_type in) { return static_cast(in); }); + } int* dim_strides = nullptr; CNML_CALL(cnmlSetTensorShape_V2( mlu_tensor_, dim_, dim_shape.data(), dim_strides)); @@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; } +void MLUTensor::ToFile(std::string file_name) { + if (mlu_ptr_) { + VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name; + int count = 1; + for (size_t i = 0; i < shape_.size(); i++) { + count *= shape_[i]; + } + VLOG(6) << " dump count: " << count; + VLOG(6) << " dump shape: "; + for (size_t i = 0; i < shape_.size(); i++) { + VLOG(6) << shape_[i] << " "; + } + + std::vector cpu_data_fp32(count); + // fp16 to fp32 + if (mlu_dtype_ == CNML_DATA_FLOAT16) { + VLOG(6) << " convert fp16 to fp32 "; + std::vector cpu_data_fp16(count); + cnrtMemcpy(cpu_data_fp16.data(), + mlu_ptr_, + count * sizeof(uint16_t), + CNRT_MEM_TRANS_DIR_DEV2HOST); + for (int i = 0; i < count; i++) { + cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]); + } + } else { + cnrtMemcpy(cpu_data_fp32.data(), + mlu_ptr_, + count * sizeof(float), + CNRT_MEM_TRANS_DIR_DEV2HOST); + } + + // trans to nchw + std::vector cpu_data_trans(count); + if (data_order_ != CNML_NCHW) { + switch (shape_.size()) { + case 4: + transpose(cpu_data_fp32.data(), + cpu_data_trans.data(), + shape_, + {0, 3, 1, 2}); + break; + case 3: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1}); + break; + case 2: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1}); + break; + case 1: + transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0}); + break; + default: + CHECK(0) << "ToFile only support dim <=4"; + break; + } + } + + // to file + std::ostringstream outs; + for 
(int i = 0; i < count; i++) { + if (data_order_ == CNML_NCHW) { + outs << cpu_data_fp32[i] << std::endl; + } else { + outs << cpu_data_trans[i] << std::endl; + } + } + std::ofstream of; + of.open(file_name, std::ios::out); + of << outs.str(); + of.close(); + } else { + LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " + << file_name; + } +} + MLUTensor::~MLUTensor() { if (mlu_tensor_ != nullptr) { CNML_CALL(cnmlDestroyTensor(&mlu_tensor_)); diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h index 12dc97a772dabc529bf183f783a22a9f2dfa936d..22268f69ba39926dbbfb1bbb18e3a86331097f90 100644 --- a/lite/kernels/mlu/bridges/tensor.h +++ b/lite/kernels/mlu/bridges/tensor.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include "lite/kernels/mlu/bridges/utility.h" @@ -33,13 +35,15 @@ class MLUTensor { MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, - cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32); + cnmlDataOrder_t shape_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC); void remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order); + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order); void Create(); cnmlTensor_t mlu_tensor(); void* mlu_data() { @@ -47,14 +51,21 @@ class MLUTensor { return mlu_ptr_; } + cnmlDataType_t dtype() { return mlu_dtype_; } void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; } + const std::vector& get_origin_shape() const { return origin_shape_; } + ~MLUTensor(); + void ToFile(std::string file_name); + cnmlDataOrder_t dorder() { return data_order_; } + private: cnmlTensor_t mlu_tensor_; std::vector shape_; + std::vector origin_shape_; cnmlTensorType_t tensor_type_; cnmlDataType_t mlu_dtype_; int dim_{0}; diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc index 377a00689ef3a27f78ae008072578ab3701cd337..36eeb473f6a37aa28a9447280f808f5fb08978d0 100644 --- a/lite/kernels/mlu/bridges/test_helper.cc +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -24,18 +24,38 @@ namespace lite { namespace subgraph { namespace mlu { +template +void PrepareInput(Graph* graph, + const std::string& input_name, + Tensor* input_tensor, + cnmlDataOrder_t order) { + thread_local Tensor temp_input; + temp_input.Resize(input_tensor->dims().Vectorize()); + temp_input.CopyDataFrom(*input_tensor); + using data_type = typename MLUTypeTraits::type; + auto input_node = graph->AddNode( + input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + MLUTypeTraits::cnml_type, + order, + reinterpret_cast( + input_tensor->template mutable_data(TARGET(kMLU)))); + CHECK(input_node); + CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data(), + temp_input.mutable_data(), + sizeof(data_type) * input_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_HOST2DEV)); +} + void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names) { + const std::vector& output_var_names, + cnmlDataOrder_t order) { CNRT_CALL(cnrtInit(0)); - ::paddle::lite::SetMluDevice(0); + lite::SetMluDevice(0); cnrtQueue_t queue_; - cnrtInvokeFuncParam_t forward_param; - u32_t affinity = 1; - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; CNRT_CALL(cnrtCreateQueue(&queue_)); 
cnrtDev_t dev_handle; CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0)); @@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr op, // Convert input data var and add it into the MLU IR graph for (auto& input_name : input_var_names) { auto input_tensor = scope->FindMutableTensor(input_name); - CHECK(input_tensor); - Tensor temp_input; - temp_input.Resize(input_tensor->dims().Vectorize()); - temp_input.CopyDataFrom(*input_tensor); - auto input_node = - graph.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph.FPType(), - reinterpret_cast( - input_tensor->mutable_data(TARGET(kMLU)))); - CHECK(input_node); - CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data(), - temp_input.mutable_data(), - sizeof(float) * input_tensor->dims().production(), - CNRT_MEM_TRANS_DIR_HOST2DEV)); + auto data_type = input_tensor->precision(); + + switch (data_type) { +#define PREPARE_INPUT(type__) \ + case PRECISION(type__): \ + PrepareInput(&graph, input_name, input_tensor, order); \ + break; + PREPARE_INPUT(kFP16) + PREPARE_INPUT(kFloat) + PREPARE_INPUT(kInt8) + PREPARE_INPUT(kInt32) +#undef PREPARE_INPUT + default: + CHECK(0); + } } op->CheckShape(); op->InferShape(); @@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr op, } graph.Compile(CNML_MLU270, 1); + graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs())); + CNRT_CALL(cnrtSyncQueue(queue_)); - graph.Compute(forward_param, queue_); for (auto& output_name : output_var_names) { auto output_tensor = scope->FindMutableTensor(output_name); Tensor temp_out; diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h index 4da9e72dfcc5a81a68467f7622e2c16aedb2ded5..36fe6f1efaed76deccdc6e9542bb52a2aefc2571 100644 --- a/lite/kernels/mlu/bridges/test_helper.h +++ b/lite/kernels/mlu/bridges/test_helper.h @@ -58,7 +58,8 @@ void FillTensor(Tensor* x, void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names); + const std::vector& output_var_names, + cnmlDataOrder_t order = CNML_NHWC); } // namespace mlu } // namespace subgraph diff --git a/lite/kernels/mlu/bridges/transpose_op.cc b/lite/kernels/mlu/bridges/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6caeb3613fea8f348e3990ec2c9660321590116 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
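`LaunchOp` above selects a concrete element type from the input tensor's runtime precision by switching into a `PrepareInput` template instantiation through a local macro. A minimal sketch of that switch-to-template dispatch pattern, with an illustrative enum and types standing in for the real precision and type traits:

#include <cstdint>
#include <cstdio>

enum class Precision { kFloat, kInt8, kInt32 };

template <typename T>
void Prepare(const void* data, int n) {
  // The real helper copies host data into the device buffer; here we only
  // report the element size that would be copied.
  std::printf("copy %d elements of %zu bytes\n", n, sizeof(T));
  (void)data;
}

void Dispatch(Precision p, const void* data, int n) {
  switch (p) {
#define PREPARE(prec, type)  \
  case Precision::prec:      \
    Prepare<type>(data, n);  \
    break;
    PREPARE(kFloat, float)
    PREPARE(kInt8, int8_t)
    PREPARE(kInt32, int32_t)
#undef PREPARE
  }
}

int main() {
  float buf[4] = {0};
  Dispatch(Precision::kFloat, buf, 4);
  return 0;
}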
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::vector axis_to_nhwc(const std::vector& axis) { + std::vector new_axis(axis.size()); + + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(axis.size())); + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(axis.size())); + + for (size_t i = 0; i < new_axis.size(); ++i) { + new_axis[i] = nhwc2nchw_axis[axis[nchw2nhwc_axis[i]]]; + } + return new_axis; +} + +int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + auto axis = op_info->GetAttr>("axis"); + std::vector axis_nhwc = axis_to_nhwc(axis); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t transpose_op{nullptr}; + + cnmlNdTransposeOpParam_t transpose_param{nullptr}; + + CNML_CALL(cnmlCreateNdTransposeOpParam( + &transpose_param, axis_nhwc.data(), axis_nhwc.size())); + + // Use cnmlCreatexxxOpForward to create op. + CNML_CALL(cnmlCreateNdTransposeProOp(&transpose_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + + graph->FuseOp(transpose_op); + CNML_CALL(cnmlDestroyBaseOp(&transpose_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle +REGISTER_SUBGRAPH_BRIDGE(transpose, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); diff --git a/lite/kernels/mlu/bridges/transpose_op_test.cc b/lite/kernels/mlu/bridges/transpose_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e8f7890581279f0ab4d51006c194967fd9c61e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/transpose_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int data_index(std::vector pos, DDimLite dims) { + int d1 = dims[1]; + int d2 = dims[2]; + int d3 = dims[3]; + return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; +} + +std::vector pos_trans(std::vector in_pos, std::vector axis) { + std::vector out_pos(in_pos.size()); + for (size_t i = 0; i < axis.size(); i++) { + out_pos[axis[i]] = in_pos[i]; + } + return out_pos; +} + +template +void transpose_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto input = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto output = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_dims = input->dims(); + auto y_dims = output->dims(); + auto axis = op_info->GetAttr>("axis"); + + // auto input_data = input->data(); + auto* input_data = input->mutable_data(); + auto* output_data = output->mutable_data(); + + int input_n = x_dims[0]; + int input_c = x_dims[1]; + int input_h = x_dims[2]; + int input_w = x_dims[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + std::vector in_pos{n, c, h, w}; + std::vector out_pos = pos_trans(in_pos, axis); + int in_index = data_index(in_pos, x_dims); + int out_index = data_index(out_pos, y_dims); + output_data[out_index] = input_data[in_index]; + } + } + } + } +} + +void test_transpose(const std::vector& input_shape, + std::vector axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("transpose"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + + // transpose_ref must run befor LaunchOp + // otherwise get Cannot access memory + // execute reference implementation and save to output tensor + transpose_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(DDim(input_shape)); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output_trans; + output_trans.Resize(out->dims()); + auto os = out->dims(); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +// TODO(pmshst): fix the transpose test 
+TEST(MLUBridges, transpose) { + std::vector input_shape = {2, 3, 4, 5}; + test_transpose(input_shape, std::vector{0, 1, 3, 2}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc index cd78553a652433fc41334a6bff5575031f5125e0..b53debd643ae2b1080644d2844d702797addabec 100644 --- a/lite/kernels/mlu/bridges/utility.cc +++ b/lite/kernels/mlu/bridges/utility.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/mlu/bridges/utility.h" + #include namespace paddle { @@ -20,33 +21,21 @@ namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, - std::vector input_shape, - std::vector axis) { +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape) { + CHECK_EQ(input_shape.size(), 2); int old_index = -1; int new_index = -1; - int dim[4] = {0}; - std::vector shape = input_shape; - for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { - for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { - for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { - for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { - old_index = dim[0] * shape[1] * shape[2] * shape[3] + - dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; - new_index = - dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - output_data[new_index] = input_data[old_index]; - } - } + for (int i = 0; i < input_shape[0]; i++) { + for (int j = 0; j < input_shape[1]; j++) { + old_index = i * input_shape[1] + j; + new_index = j * input_shape[0] + i; + output_data[new_index] = input_data[old_index]; } } } -int scale2position(float scale) { return static_cast(-std::log2(scale)); } - void dequant(float* dst, int8_t* src, size_t size, float scale) { for (size_t i = 0; i < size; ++i) { dst[i] = static_cast(src[i]) * scale; diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h index fa8fb1597c0fb068a855928dd20057d48ecd5eaf..fd1e5eb265936f11f258d86e2b6a91af1d55c6ed 100644 --- a/lite/kernels/mlu/bridges/utility.h +++ b/lite/kernels/mlu/bridges/utility.h @@ -16,24 +16,76 @@ #include #include + #include #include #include + #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" +#include "lite/fluid/float16.h" namespace paddle { namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape); + +template +void transpose(dtype* input_data, + dtype* output_data, std::vector input_shape, - std::vector axis); -int scale2position(float scale); + std::vector axis) { + int old_index = -1; + int new_index = -1; + std::vector shape; + std::vector expand_axis; + if (input_shape.size() < 5u) { + for (size_t i = 0; i < 5 - input_shape.size(); i++) { + shape.push_back(1); + expand_axis.push_back(i); + } + for (size_t i = 0; i < input_shape.size(); i++) { + shape.push_back(input_shape[i]); + expand_axis.push_back(axis[i] + 5 - input_shape.size()); + } + } else { + shape = input_shape; + expand_axis = axis; + } + int dim[5] = {0}; + for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < 
shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) { + for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] + + dim[1] * shape[2] * shape[3] * shape[4] + + dim[2] * shape[3] * shape[4] + dim[3] * shape[4] + + dim[4]; + new_index = dim[expand_axis[0]] * shape[expand_axis[1]] * + shape[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[1]] * shape[expand_axis[2]] * + shape[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[4]]; + output_data[new_index] = input_data[old_index]; + } + } + } + } + } +} + +inline int scale2position(float scale) { return std::floor(-std::log2(scale)); } + void dequant(float* dst, int8_t* src, size_t size, float scale); void dequant(float* dst, @@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC( std::vector({dim[0], dim[2], dim[3], dim[1]})); } -inline const std::vector DimNHWC2NCHW( - const std::vector& dim) { - return std::vector({dim[0], dim[3], dim[1], dim[2]}); +template +inline const std::vector DimNHWC2NCHW( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[3], dim[1], dim[2]}); + case 5: + return std::vector({dim[0], dim[4], dim[1], dim[2], dim[3]}); + default: + CHECK(0) << "unsupport dimension"; + } +} + +template +inline const std::vector DimNCHW2NHWC( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[2], dim[3], dim[1]}); + case 5: + return std::vector({dim[0], dim[2], dim[3], dim[4], dim[1]}); + default: + CHECK(0) << "unsupport dimension"; + } } -inline const std::vector DimNCHW2NHWC( - const std::vector& dim) { - return std::vector({dim[0], dim[2], dim[3], dim[1]}); +template +inline std::vector GetAxisNHWC2NCHW(size_t n_dims) { + std::vector nhwc2nchw_axis(n_dims); + nhwc2nchw_axis[0] = 0; + if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1; + for (size_t i = 2; i < n_dims; ++i) { + nhwc2nchw_axis[i] = i - 1; + } + return nhwc2nchw_axis; +} + +template +inline std::vector GetAxisNCHW2NHWC(size_t n_dims) { + std::vector nchw2nhwc_axis(n_dims); + nchw2nhwc_axis[0] = 0; + for (size_t i = 1; i < n_dims - 1; ++i) { + nchw2nhwc_axis[i] = i + 1; + } + if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1; + return nchw2nhwc_axis; } template -struct FPTypeTraits {}; +struct MLUTypeTraits { + /* using type = void; */ + /* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */ +}; + +template <> +struct MLUTypeTraits { + using type = float; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32; +}; + +template <> +struct MLUTypeTraits { + using type = paddle::lite::fluid::float16; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16; +}; template <> -struct FPTypeTraits { - typedef float T; +struct MLUTypeTraits { + using type = int8_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8; }; template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; +struct MLUTypeTraits { + using type = int32_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32; }; } // namespace mlu diff --git 
a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..ff8a7ddf6e4c465f288ba42b5b2537294a9d7ffd 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -41,6 +41,9 @@ class IoCopyHostToMluCompute auto mem_size = param.x->memory_size(); // LOG(INFO) << "copy size " << mem_size; auto* data = param.y->mutable_data(TARGET(kMLU), mem_size); + VLOG(6) << "io_copy host to mlu] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + param.y->set_precision(param.x->precision()); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -79,6 +82,13 @@ class IoCopyMluToHostCompute CHECK(param.x->target() == TARGET(kMLU)); auto mem_size = param.x->memory_size(); auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + VLOG(6) << "io_copy mlu to host] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + + // sync queue to ensure process done + auto& mlu_context = this->ctx_->template As(); + CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue())); + CopyToHostSync(data, param.x->raw_data(), mem_size); } @@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt32, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_kInt32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt8, + kNHWC, + 
paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_to_kInt8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt8), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc index d4e16734d6d2dae6f5c119194008bce114a2e918..42b12740ff0edb88ea2944e25ca03ade36caa956 100644 --- a/lite/kernels/mlu/layout_compute.cc +++ b/lite/kernels/mlu/layout_compute.cc @@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp32) .BindInput("Input", @@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp16) .BindInput("Input", @@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp32) .BindInput("Input", @@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp16) .BindInput("Input", @@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kInt8, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, - def_layout_nchw2nhwc_fp32_int8) + def_layout_nchw2nhwc_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8), diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h index edacdf8a98a2ffde6e538f61d4dd8259e3211b22..df254865994fe8548df0e021ecb471f5a1020080 100644 --- a/lite/kernels/mlu/layout_compute.h +++ b/lite/kernels/mlu/layout_compute.h @@ -22,6 +22,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/kernels/mlu/bridges/utility.h" #include "lite/operators/layout_op.h" namespace paddle { @@ -29,24 +30,6 @@ namespace lite { namespace kernels { namespace mlu { -template -struct FPTypeTraits {}; - -template <> -struct FPTypeTraits { - typedef float T; -}; - -template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; -}; - -template <> -struct FPTypeTraits { - typedef int8_t T; -}; - template inline void LayoutTransCompute(const int dim, const lite::Context& context, @@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim, template class LayoutNchwToNhwcCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); + auto x_ndims = param.x->dims().size(); auto& context = this->ctx_->template As(); const auto origin_dims = out->dims().Vectorize(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: axis = {0, 2, 1}; out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[1]}); break; case 4: axis = {0, 2, 3, 1}; out->Resize(std::vector{ - out->dims()[0], 
out->dims()[2], out->dims()[3], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]}); break; default: CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc"; } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, *x, out, axis); - if (x_dims > 2) { + if (x_ndims > 2) { out->Resize(origin_dims); } } @@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute template class LayoutNhwcToNchwCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); auto& context = this->ctx_->template As(); - const auto origin_dims = out->dims().Vectorize(); + TensorLite tmp_t; + tmp_t.ShareDataWith(*x); + const auto x_dims = x->dims().Vectorize(); + auto x_ndims = param.x->dims().size(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: - out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + tmp_t.Resize(std::vector{x_dims[0], x_dims[2], x_dims[1]}); axis = {0, 2, 1}; break; case 4: - out->Resize(std::vector{ - out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]}); + tmp_t.Resize( + std::vector{x_dims[0], x_dims[2], x_dims[3], x_dims[1]}); axis = {0, 3, 1, 2}; break; default: @@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); - - if (x_dims > 2) { - out->Resize(origin_dims); - } + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, tmp_t, out, axis); } std::string doc() const override { diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc index 73ca9dcc20a6311d33e5cff6c6ed6be08f3c7a1f..450031021d3ad70c6abb348a6e498d8876f5ec56 100644 --- a/lite/kernels/mlu/subgraph_compute.cc +++ b/lite/kernels/mlu/subgraph_compute.cc @@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_kFloat) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_FP16) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf..75570a6249ecaa36a94b73dafb27f655495cab87 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -14,17 +14,24 @@ #pragma once +#include +#include #include #include #include + #include "lite/api/paddle_place.h" #include "lite/core/kernel.h" +#include "lite/core/op_lite.h" #include "lite/core/op_registry.h" 
+#include "lite/core/tensor.h" #include "lite/core/type_system.h" #include "lite/core/types.h" #include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -36,125 +43,434 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc* block_desc, + const std::shared_ptr& program_desc, + Scope* exec_scope, const std::vector& input_names, const std::vector& output_names, - Scope* scope, - ::paddle::lite_api::PrecisionType type) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) { - graph_.SetFPType(type); + paddle::lite_api::PrecisionType type) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names), + fp_type_(type) { + VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is " + << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL"); + VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is " + << GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE", + true); + VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is " + << GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE", true)) { + disable_batch_size_changeable_ = true; + } } - int Build() { - // In order to attach all of the ops of the block desc, we need to build - // the original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + bool InputShapeChanged() { + std::vector> new_shape; + // used in batch changable situation + std::vector> all_shape; + for (auto origin_itensor : origin_itensors_) { + if (!disable_batch_size_changeable_) { + auto iv = origin_itensor->dims().Vectorize(); + all_shape.push_back(iv); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + } + inputs_shape_ = new_shape; + all_inputs_shape_ = all_shape; + if (shape_graph_map_.count(inputs_shape_) > 0) { + return false; + } + VLOG(3) << "MLU graph input shape changed" << std::endl; + return true; } - int Launch() { - // Rebuild device program when the shapes of input tensors have been - // changed. 
- if (subgraph::CHECK_SUCCESS(build_device_program_status_) && - subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( - build_device_program_status_) && - InputShapeChanged()) { - Build(); - } - if (subgraph::CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) { + switch (data_type) { + case paddle::lite_api::PrecisionType::kFP16: + return CNML_DATA_FLOAT16; + case paddle::lite_api::PrecisionType::kFloat: + return CNML_DATA_FLOAT32; + case paddle::lite_api::PrecisionType::kInt32: + return CNML_DATA_INT32; + case paddle::lite_api::PrecisionType::kInt8: + return CNML_DATA_UINT8; + default: + return PrecisionToDatatype(fp_type_); } - return 0; } protected: - int BuildDeviceProgram() override { + bool BuildDeviceProgram() override { + if (!origin_program_) { + BuildOriginProgram(); + } + if (!error_compile_batch_size_changeable_ && + !disable_batch_size_changeable_) { + int status = BuildDeviceProgramImpl(); + if (subgraph::CHECK_SUCCESS(status)) { + return status; + } + LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, " + "changed to input_shape changeable"; + } + error_compile_batch_size_changeable_ = true; + disable_batch_size_changeable_ = true; + return BuildDeviceProgramImpl(); + } + + bool BuildDeviceProgramImpl() { int status = 0; + auto graph = std::make_shared(); + graph->SetFPType(fp_type_); + std::vector> new_shape; + origin_itensors_.clear(); + origin_otensors_.clear(); + + auto* sub_block_desc = + program_desc_->GetBlock()(block_idx_); + auto data_order = sub_block_desc->GetOp(0)->Type() == "layout" + ? CNML_NCHW + : CNML_NHWC; // Convert all of input data vars and added into the MLU IR graph + status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; for (auto& input_name : input_names_) { - auto input_tensor = scope_->FindMutableTensor(input_name); + auto input_tensor = exec_scope_->FindMutableTensor(input_name); + auto data_type = input_tensor->precision(); + cnmlDataType_t fp_type = PrecisionToDatatype(data_type); + origin_itensors_.push_back(input_tensor); + if (!disable_batch_size_changeable_) { + auto iv = input_tensor->dims().Vectorize(); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(input_tensor->dims().Vectorize()); + } + CHECK(input_tensor); - auto input_node = - graph_.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph_.FPType(), - const_cast(input_tensor->raw_data())); + VLOG(4) << "subgraph input tensor " << input_name << std::endl; + auto input_node = graph->AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + fp_type, + data_order); CHECK(input_node); // MLU doesn't support dynamic dimensions/shapes, so need to rebuild // the program when the shape of any input tensor is changed. 
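// A minimal illustrative sketch of how the compiled-graph cache key is formed
// here (hypothetical free function; the real logic lives inline in
// InputShapeChanged() and BuildDeviceProgramImpl()): when batch-size-changeable
// mode is active the leading batch dimension is dropped, so inputs that differ
// only in batch size hit the same entry of shape_graph_map_.
#include <cstdint>
#include <vector>

std::vector<int64_t> MakeShapeKey(const std::vector<int64_t>& dims,
                                  bool batch_size_changeable) {
  std::vector<int64_t> key = dims;
  if (batch_size_changeable && !key.empty()) {
    key.erase(key.begin());  // drop the batch dimension
  }
  // e.g. {4, 3, 224, 224} and {8, 3, 224, 224} both map to {3, 224, 224}
  return key;
}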
- status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; } LOG(INFO) << "START TO CONVERT "; // Convert all of ops and its weights and added into the MLU IR graph const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = inst.op(); CHECK(op); std::string op_type = op->op_info()->Type(); + // since cnml's compile api will not return error now, we simply check + // op's type + if (!disable_batch_size_changeable_ && + std::find(unsupport_batch_size_changeable_op_type_.begin(), + unsupport_batch_size_changeable_op_type_.end(), + op_type) != + unsupport_batch_size_changeable_op_type_.end()) { + status |= subgraph::FAILED; + VLOG(4) << "[MLU] found unsupported batch_size changeable op type: " + << op_type; + if (subgraph::CHECK_FAILED(status)) { + return false; + } + return true; + } op->CheckShape(); const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kMLU))( - reinterpret_cast(&graph_), + reinterpret_cast(graph.get()), const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the MLU IR graph and build the graph to MLU // runtime - std::vector valid_output_names; for (auto& output_name : output_names_) { - if (graph_.HasNode(output_name)) { - graph_.AddOutput(graph_.GetNode(output_name)); - auto output_tensor = scope_->FindMutableTensor(output_name); - void* p_data = static_cast( - output_tensor->mutable_data::T>( - TARGET(kMLU))); - auto node = graph_.GetNode(output_name); - CHECK(p_data); - node->set_mlu_ptr(p_data); - valid_output_names.push_back(output_name); + if (graph->HasNode(output_name)) { + graph->AddOutput(graph->GetNode(output_name)); + auto output_tensor = exec_scope_->FindMutableTensor(output_name); + origin_otensors_.push_back(output_tensor); + VLOG(4) << "subgraph output tensor " << output_name << std::endl; + + // auto node = graph->GetNode(output_name); + // CHECK(p_data); + // node->set_mlu_ptr(p_data); } } for (auto& input_name : input_names_) { - graph_.AddInput(graph_.GetNode(input_name)); + graph->AddInput(graph->GetNode(input_name), + disable_batch_size_changeable_); } - CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; + + CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names"; auto& mlu_context = this->ctx_->template As(); auto core_version = mlu_context.MLUCoreVersion(); auto core_number = mlu_context.MLUCoreNumber(); - graph_.Compile(core_version, core_number); - return status; + graph->Compile(core_version, core_number); + shape_graph_map_[new_shape] = graph; + if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) { + graph->GenOfflineModel(GetOfflineModName()); + } + return true; + } + + std::string TrimStrings(const std::string& origin_str) { + std::string str = origin_str; + std::size_t found = str.find("0x"); + std::size_t found_end = 0; + const std::vector del_strs = { + "/trans_io_copy", "/trans_cast", "/trans_layout"}; + for (const auto& iterm : del_strs) { + found_end = str.find(iterm); + // trim point address and one of the del_strs + if (found != std::string::npos && found_end != std::string::npos) { + str.replace(found, found_end - found, ""); + found_end = str.find(iterm); + 
str.replace(found_end, iterm.size(), ""); + break; + } + } + return str; + } + + std::string GetOfflineModName() { + sort(input_names_.begin(), input_names_.end()); + sort(output_names_.begin(), output_names_.end()); + const auto& delimiter = "__"; + const auto& delimiter_num = "_"; + const auto& input_shape_str = "input_shape_"; + const auto& output_shape_str = "output_shape_"; + std::string name = ""; + std::string tmp = ""; + for (const auto& input_name : input_names_) { + tmp = input_name; + name += TrimStrings(tmp) + delimiter + input_shape_str; + auto input_tensor = exec_scope_->FindMutableTensor(input_name); + for (const auto& iterm : input_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + for (const auto& output_name : output_names_) { + tmp = output_name; + name += TrimStrings(tmp) + delimiter + output_shape_str; + auto output_tensor = exec_scope_->FindMutableTensor(output_name); + for (const auto& iterm : output_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + std::replace(name.begin(), name.end(), '/', '-'); + return name; } - int LaunchDeviceProgram() override { + void InferOutputsShapeOnly() { + // infer outputs shape when enable BATCH_SIZE_CHANGEABLE + const auto iter = in_out_shape_map_.find(all_inputs_shape_); + if (iter != in_out_shape_map_.end()) { + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(iter->second[i]); + } + } else { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + const_cast(op)->InferShape(); + } + std::vector> outs_shape; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + outs_shape.push_back(origin_otensors_[i]->dims().Vectorize()); + } + in_out_shape_map_[all_inputs_shape_] = outs_shape; + } + } + + inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) { + if (use_mlu_cast) { + // output is float, since cast fused in subgraph + return static_cast(tensor->mutable_data(TARGET(kMLU))); + } else { + return static_cast( + tensor->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>( + TARGET(kMLU))); + } + } + + bool LaunchDeviceProgram() override { + // prepare input and output memory auto& mlu_context = this->ctx_->template As(); auto exec_queue = mlu_context.exec_queue(); - u32_t affinity = mlu_context.affinity(); - cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; - graph_.Compute(forward_param, exec_queue); - return 0; + + auto graph = shape_graph_map_[inputs_shape_]; + auto* graph_input = graph->MutableInputs(); + auto* graph_output = graph->MutableOutputs(); + CHECK_EQ(graph_input->size(), origin_itensors_.size()); + CHECK_EQ(graph_output->size(), origin_otensors_.size()); + + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + + if (!disable_batch_size_changeable_) { + std::vector> + graph_in; + if (shape_tensor_map_in_.find(all_inputs_shape_) != + shape_tensor_map_in_.end()) { + graph_in = shape_tensor_map_in_[all_inputs_shape_]; + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_in[i]->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + } else { + graph_in.reserve(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + 
paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_itensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_input->at(i)->dtype()); + tmp.set_mlu_ptr(const_cast(origin_itensors_[i]->raw_data())); + graph_in.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_in_[all_inputs_shape_] = graph_in; + } + + // TODO(zhangmingwei): we just call every op's infer_shape to get outputs' + // shape, may be it's better to use cnml's api to get output shape. This + // can be done when cnml's tensor dimension is totally equal to lite's + // tensor + // shape. + InferOutputsShapeOnly(); + // const std::vector> new_output_size = + // graph->InferOutputsShape(graph_in); + + std::vector> + graph_out; + + if (shape_tensor_map_out_.find(all_inputs_shape_) != + shape_tensor_map_out_.end()) { + graph_out = shape_tensor_map_out_[all_inputs_shape_]; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + graph_out[i]->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + } else { + graph_out.reserve(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_otensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_output->at(i)->dtype()); + tmp.set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + graph_out.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_out_[all_inputs_shape_] = graph_out; + } + graph->Compute(exec_queue, graph_in, graph_out); + } else { + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_input->at(i)->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape()); + graph_output->at(i)->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + // only cnmlComputeFusionOpForward_V3 need cnrtInvokeFuncParam_t + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + u32_t affinity = mlu_context.affinity(); + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph->Compute(forward_param, exec_queue); + +#ifdef MLU_DUMP_SUBGRAPH_IO + // Graph node store compile-time tensor while batchsize mutable is set. 
+ // Only batchsize mutable is disabled, data exists in graph node at + // runtime + // =========== DUMP =================== + for (auto input_name : input_names_) { + auto input_tensor = + shape_graph_map_[inputs_shape_]->GetNode(input_name); + auto dump_name = input_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + input_tensor->ToFile(dump_name); + } + for (auto output_name : output_names_) { + if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) { + auto output_tensor = + shape_graph_map_[inputs_shape_]->GetNode(output_name); + auto dump_name = output_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + output_tensor->ToFile(dump_name); + } else { + VLOG(6) << "graph does not have " << output_name << " as output" + << std::endl; + } + } +#endif + // =========== DUMP END ================ + } + + return true; } - paddle::lite::subgraph::mlu::Graph graph_; + paddle::lite_api::PrecisionType fp_type_; + std::vector> inputs_shape_{}; + std::vector> all_inputs_shape_{}; + std::map>, + std::shared_ptr> + shape_graph_map_{}; + // enable batch size changeable by default, this cound be changed by + // environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and + // whether the op can be compiled with batch size changeable way + bool disable_batch_size_changeable_{false}; + bool error_compile_batch_size_changeable_{false}; + std::vector unsupport_batch_size_changeable_op_type_{"concat"}; + // search output runtime MLUTensor for certain output shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_out_{}; + // search input runtime MLUTensor for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_in_{}; + // search output shape for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, std::vector>> + in_out_shape_map_{}; }; template @@ -167,19 +483,18 @@ class SubgraphCompute auto& param = this->template Param(); // LOG(INFO) << "SUBGRAP Prepare RUN index " << param.sub_block_idx; engine_.reset(new SubgraphEngine(this->ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, param.output_data_names, - param.scope, this->precision())); CHECK(engine_); - engine_->Build(); } void Run() override { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } virtual ~SubgraphCompute() = default; diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 5157f47867160cf4f705306ca37cfad962373386..be30d1c03988cb8b88761c0719c2785446c0b0ea 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU AND NOT LITE_WITH_HUAWEI_ASCEND_NPU) return() endif() diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index db9a652b6c1b4055e09a70e1f407b1027fd1b1e8..afe689729d3efde0a611c6da2086e0f8cf58a307 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ 
-100,6 +100,9 @@ int ActConverter(void* ctx, auto offset = op_info->GetAttr("offset"); act_op->set_attr_negative_slope(slope); act_op->set_attr_coef(offset); + } else if (op_type == "thresholded_relu") { + auto threshold = op_info->GetAttr("threshold"); + act_op->set_attr_coef(threshold); } return SUCCESS; } @@ -141,6 +144,10 @@ REGISTER_SUBGRAPH_BRIDGE( hard_sigmoid, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + thresholded_relu, + kNPU, + paddle::lite::subgraph::npu::ActConverter); REGISTER_SUBGRAPH_BRIDGE( log, kNPU, paddle::lite::subgraph::npu::ActConverter); diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 8ca8357710e1f36a7c3f21417d7633e47f18c59a..b9f81a74ad997966ecb79c66bceed1e84b4a91f7 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -15,6 +15,7 @@ #include "lite/kernels/npu/bridges/engine.h" #include #include +#include #include #include "lite/kernels/npu/bridges/registry.h" @@ -22,104 +23,90 @@ namespace paddle { namespace lite { namespace subgraph { -int Engine::BuildDeviceProgram() { return FAILED; } +Engine::Engine(KernelContext *ctx, + int block_idx, + const std::shared_ptr &program_desc, + Scope *exec_scope, + const std::vector &input_names, + const std::vector &output_names) + : ctx_(ctx), + block_idx_(block_idx), + program_desc_(program_desc), + exec_scope_(exec_scope) { + input_names_ = input_names; + output_names_ = output_names; + // Sort the name of input and output tensors, it's convenient for us to get + // the info of input and output tensors in the same order from the device + // program, because the result of subgraph division may be different but right + // at each call of the subgraph pass. + std::stable_sort(input_names_.begin(), input_names_.end()); + std::stable_sort(output_names_.begin(), output_names_.end()); +} + +bool Engine::Run() { + if (is_first_epoch_) { + PrepareWorkspaceForDeviceProgram(); + is_first_epoch_ = false; + } + if (InputShapeChanged()) { + BuildDeviceProgram(); + } + return LaunchDeviceProgram(); +} -int Engine::LaunchDeviceProgram() { return 0; } +bool Engine::PrepareWorkspaceForOriginProgram() { + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = exec_scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + } + origin_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = exec_scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + } + return true; +} -int Engine::BuildOriginProgram() { +bool Engine::BuildOriginProgram() { // TODO(hong19860320) The block_desc need to be divided into subgraphs during // the exection time. But only see them as a subgraph now. 
- origin_program_.clear(); - for (size_t op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) { - auto op_desc = block_desc_->GetOp(op_idx); - CHECK(op_desc); - std::string op_type = op_desc->Type(); - auto op = LiteOpRegistry::Global().Create(op_desc->Type()); - op->Attach(*op_desc, scope_); - std::unique_ptr picked_kernel; - if (op_desc->HasAttr(kKernelTypeAttr)) { - // Create op and pick up kernel according to the kKernelTypeAttr attribute - auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); - std::string alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type - << " for " << op_type; - auto kernels = op->CreateKernels({place}); - CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - picked_kernel = std::move(*it); - } else { - VLOG(3) << "The attr '" << kKernelTypeAttr - << "' not found, pick the first kernel for " << op_type; - std::vector> kernels; -#if defined(LITE_WITH_ARM) - kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); -#elif defined(LITE_WITH_X86) - kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); -#endif - if (kernels.size() > 0) { - picked_kernel = std::move(kernels.front()); - } else { - LOG(WARNING) << "No kernels found for " << op_type; - } - } - if (picked_kernel != nullptr) { - picked_kernel->SetContext( - ContextScheduler::Global().NewContext(picked_kernel->target())); - } - origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); + if (!origin_program_) { + origin_program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, block_idx_)); } - return 0; + return true; } -int Engine::LaunchOriginProgram() { - for (auto& inst : origin_program_) { - auto op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - inst.Run(); +bool Engine::LaunchOriginProgram() { + if (!origin_program_) { + BuildOriginProgram(); } - return 0; + if (origin_program_) { + VLOG(3) << "Roll back to run the origin program."; + origin_program_->Run(); + return true; + } + return false; } -int Engine::Build() { - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; +bool Engine::PrepareWorkspaceForDeviceProgram() { + return PrepareWorkspaceForOriginProgram(); } -void Engine::InitDeviceTensor() { return; } +bool Engine::BuildDeviceProgram() { return BuildOriginProgram(); } + +bool Engine::LaunchDeviceProgram() { return LaunchOriginProgram(); } bool Engine::InputShapeChanged() { + bool changed = false; for (size_t i = 0; i < origin_itensors_.size(); i++) { - if (origin_itensors_[i]->dims() != origin_idims_[i]) { - return true; - } - } - return false; -} - -int Engine::Launch() { - // Rebuild device program when the shapes of input tensors have been changed. 
- if (CHECK_SUCCESS(build_device_program_status_) && - CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && - InputShapeChanged()) { - Build(); - InitDeviceTensor(); - } - if (CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + auto origin_idim = origin_itensors_[i]->dims().Vectorize(); + changed |= origin_idim != origin_idims_[i]; + origin_idims_[i] = origin_idim; } - return 0; + return changed; } } // namespace subgraph diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index 6a3f72077a9bed7a296b184330af119262472ada..daa02fb0d7bf8f70ebf8b21821a274b6a0ba062d 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -30,52 +30,39 @@ class Engine { public: Engine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - lite::Scope *scope, - std::string model_cache_dir = "") - : ctx_(ctx), - block_idx_(block_idx), - block_desc_(block_desc), - input_names_(input_names), - output_names_(output_names), - scope_(scope), - model_cache_dir_(model_cache_dir) {} + const std::vector &output_names); virtual ~Engine() = default; - virtual int Build(); - virtual int Launch(); + virtual bool Run(); private: Engine(const Engine &) = delete; protected: - virtual int BuildDeviceProgram(); - virtual int LaunchDeviceProgram(); + virtual bool PrepareWorkspaceForOriginProgram(); + virtual bool BuildOriginProgram(); + virtual bool LaunchOriginProgram(); - virtual int BuildOriginProgram(); - virtual int LaunchOriginProgram(); + virtual bool PrepareWorkspaceForDeviceProgram(); + virtual bool BuildDeviceProgram(); + virtual bool LaunchDeviceProgram(); - virtual void InitDeviceTensor(); virtual bool InputShapeChanged(); KernelContext *ctx_{nullptr}; - int block_idx_; - cpp::BlockDesc *block_desc_; + int block_idx_{-1}; + const std::shared_ptr program_desc_{nullptr}; std::vector input_names_; std::vector output_names_; - Scope *scope_{nullptr}; - // SUCCESS: device program build successed. FAILED: device program build - // failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need - // to rebuild when input shape changed. 
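// A minimal illustrative sketch of how a backend plugs into the reworked
// Engine (hypothetical MyEngine with made-up helpers CompileToDevice() and
// RunOnDevice(); only the bool-returning hooks shown as overrides exist in
// this patch): Run() calls PrepareWorkspaceForDeviceProgram() on the first
// epoch, BuildDeviceProgram() whenever InputShapeChanged() reports a change,
// and LaunchDeviceProgram() on every invocation; rolling back to the origin
// program is done explicitly by the backend, as the NPU/MLU engines do.
#include "lite/kernels/npu/bridges/engine.h"

class MyEngine : public paddle::lite::subgraph::Engine {
 public:
  using paddle::lite::subgraph::Engine::Engine;  // inherit the constructor

 protected:
  bool BuildDeviceProgram() override {
    if (!origin_program_) BuildOriginProgram();
    device_ready_ = CompileToDevice();  // hypothetical backend compile step
    return device_ready_;
  }
  bool LaunchDeviceProgram() override {
    if (!device_ready_) return LaunchOriginProgram();  // roll back to CPU
    return RunOnDevice();  // hypothetical backend execution step
  }

 private:
  bool CompileToDevice() { return false; }  // placeholder
  bool RunOnDevice() { return true; }       // placeholder
  bool device_ready_{false};
};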
- int build_device_program_status_{0}; - std::vector origin_idims_; - std::vector origin_odims_; + Scope *exec_scope_{nullptr}; + bool is_first_epoch_{true}; + std::vector> origin_idims_; std::vector origin_itensors_; std::vector origin_otensors_; - std::vector origin_program_; - std::string model_cache_dir_{""}; + std::unique_ptr origin_program_{nullptr}; }; } // namespace subgraph diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h index 38b03e06fa212728888cf47b3048d71fd4de06fc..1bc588496a253aa82183e020adc39989ad8d7312 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "graph/op/all_ops.h" +#include "graph/compatible/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc index 32af1916899454ef7a045339da5e9fc8a6131cfc..79ba82d94f24f61c2b9f51bd29634151bfcfa0ab 100644 --- a/lite/kernels/npu/bridges/matmul_op.cc +++ b/lite/kernels/npu/bridges/matmul_op.cc @@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } else { matmul_node = graph->Add(out_name); auto matmul_op = matmul_node->data(); - matmul_op->set_input_x(*x_node->data()); - matmul_op->set_input_y(*y_node->data()); - matmul_op->set_attr_adj_x(transpose_x); - matmul_op->set_attr_adj_y(transpose_y); + matmul_op->set_input_x1(*x_node->data()); + matmul_op->set_input_x2(*y_node->data()); + matmul_op->set_attr_adj_x1(transpose_x); + matmul_op->set_attr_adj_x2(transpose_y); } if (fabs(alpha - 1.f) > 1e-6f) { diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h index b6ce66fe34963d8c3bc9c2bccc0f3a294ab16290..d431133bfd2361c3ffd80d54c445d13e382492f5 100644 --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -25,6 +25,7 @@ USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU); USE_SUBGRAPH_BRIDGE(log, kNPU); USE_SUBGRAPH_BRIDGE(sqrt, kNPU); USE_SUBGRAPH_BRIDGE(square, kNPU); +USE_SUBGRAPH_BRIDGE(thresholded_relu, kNPU); USE_SUBGRAPH_BRIDGE(batch_norm, kNPU); USE_SUBGRAPH_BRIDGE(less_than, kNPU); diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc index d9c9ffae923631d20c462149a57fccf3335836fd..abc24ea2ca8fea35007687218db963181e304156 100644 --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -144,6 +144,8 @@ int CvtActMode(std::string act_type) { act_mode = 9; } else if (act_type == "hard_sigmoid") { act_mode = 10; + } else if (act_type == "thresholded_relu") { + act_mode = 11; } else { // TODO(hong19860320) support more activation mode LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h index 107d90c116b8239a9060f252c45c2b2d7901ddf7..6e75e58187909ad59da37dbcb0737a92ec014e22 100644 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -20,11 +20,11 @@ #include #include #include "graph/buffer.h" +#include "graph/compatible/operator_reg.h" #include "graph/graph.h" #include "graph/model.h" #include "graph/op/all_ops.h" #include "graph/operator.h" -#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -97,25 +97,26 @@ REG_OP(Pad) /* * Multiplies slices of two tensors in batches. 
* - * x : The input tensor - * y : The input tensor + * x1 : The input tensor + * x2 : The input tensor * - * z : The output tensor + * y : The output tensor * - * adj_x : adj_x is true, the input tensor x is transposed, otherwise - * it will not be transposed. Default is false (The current version only - * supports false). - * adj_y : adj_y is true, the input tensor y is transposed, otherwise - * it will not be transposed. Default is false. + * adj_x1 : adj_x1 is true, the input tensor x1 is transposed, + * otherwise it will not be transposed. + * Default is false (The current version only supports false). + * adj_x2 : adj_x2 is true, the input tensor x2 is transposed, + * otherwise it will not be transposed. + * Default is false. * - * 100.320.010.010 + * 100.320.010.010 */ REG_OP(BatchMatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(y, TensorType({DT_FLOAT})) - .OUTPUT(z, TensorType({DT_FLOAT})) - .ATTR(adj_x, AttrValue::BOOL{false}) - .ATTR(adj_y, AttrValue::BOOL{false}) + .INPUT(x1, TensorType({DT_FLOAT})) + .INPUT(x2, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(adj_x1, AttrValue::BOOL{false}) + .ATTR(adj_x2, AttrValue::BOOL{false}) .OP_END() } // namespace ge diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index f17d73f8dfd540c8a1b809d780084b05299ccc2f..e9c5957ff6d8f026f712de04f4e32cd69baf50a9 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" @@ -24,205 +25,283 @@ #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #include "lite/kernels/npu/bridges/utility.h" #include "lite/utils/io.h" +#include "lite/utils/md5.h" namespace paddle { namespace lite { namespace kernels { namespace npu { -std::string SubgraphEngine::GenerateModelCacheName() const { - auto inames = device_inames_; - auto onames = device_onames_; - std::stable_sort(inames.begin(), inames.end()); - - std::string model_cache_name = "subgraph_" + std::to_string(block_idx_); - for (auto iname : inames) { - model_cache_name += "_"; - auto itensor = scope_->FindTensor(iname); - int tmp = 0; - for (auto i : itensor->dims().Vectorize()) { - tmp += i * i; +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. 
the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (int i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; } - model_cache_name += std::to_string(tmp % 1999); } - model_cache_name += "_.om"; + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} - return model_cache_name; +// Deserialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op from the cached configuration file and HiAI +// om file +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Load model from " << model_path; + std::vector model_buffer; + if (!ReadFile(model_path, &model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!"; + return false; + } + bool model_comp = false; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; + } + // Rewrite with the compatible model data if the cached + // model file is incompatible with the current device + if (!model_comp) { + VLOG(3) << "[NPU] Export the compatible model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + } + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[NPU] read from " << config_path << " failed!"; + return false; + } + std::string str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; } -int SubgraphEngine::BuildDeviceProgram() { +bool DeviceProgram::BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + 
// then added them into the HiAI IR graph int status = 0; - // Convert all of ops and their input vars and weights and added into the NPU - // HiAI IR graph subgraph::npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + CHECK(origin_program) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[NPU] No instructions found in the origin program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Collect the valid input and output nodes in the HiAI IR graph and update - // the input and output names - device_inames_.clear(); - device_onames_.clear(); + // Collect the input and output nodes of the HiAI IR graph std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } std::vector device_onodes; - for (auto& input_name : input_names_) { - if (graph.Has(input_name)) { - if (graph.Get(input_name)->is_data()) { - device_inodes.push_back(*graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it is not a data node."; - } - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it does not exist."; - } + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); } - for (auto& output_name : output_names_) { - if (graph.Has(output_name)) { - device_onodes.push_back(*graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[NPU] Output node " << output_name - << " is ignored because it does not exist."; - } + // Build the HiAI IR graph to the HiAI om model + std::vector model_buffer; + if (!lite::npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[NPU] Build model failed!"; + return false; } - CHECK(!device_inames_.empty()) - << "[NPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[NPU] No output nodes found for building NPU model"; - - // Build the HiAI IR graph to HiAI om model as the device program - if (device_program_map_.count(inputs_shape_) > 0) { - return status; + // Load the HiAI om model and create a HiAI model manager client(from HiAI + // Service) to run inference. + bool model_comp = true; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; } - std::string model_cache_full_dir = - model_cache_dir_.empty() ? 
"" : model_cache_dir_ + "/" + - GenerateModelCacheName(); - auto device_client = lite::npu::Device::Global().Build( - model_name_, device_inodes, device_onodes, model_cache_full_dir); - if (device_client == nullptr) { - LOG(WARNING) << "[NPU] Build model failed!"; - return subgraph::FAILED; + // Do not check model compatibility because it assume that the cached om model + // is always compatible with the current device + // Update the precison and dimensions of the origin output tensors + // Update the precison and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); } - auto device_program = std::make_shared(device_client); - if (!inputs_shape_.empty()) { - device_program_map_[inputs_shape_] = device_program; + if (!model_cache_dir.empty()) { + // Save the generated model to file, used for the model caching or the + // offline model generation + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Save model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (int i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Save configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[NPU] Open " << config_path << " for writting failed!"; + } } + return true; +} - // Query and check the dimensions of valid input and output tensors - std::vector device_idims, device_odims; - if (device_program->client->GetModelIOTensorDim( - model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { - LOG(WARNING) - << "[NPU] Get the dimensions of input and output tensors failed!"; - return subgraph::FAILED; +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + if (device_idims_.empty() || device_odims_.empty()) { + if (model_client_->GetModelIOTensorDim( + model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) { + LOG(WARNING) + << "[NPU] Get the dimensions of input and output tensors failed!"; + return false; + } } - device_program->device_idims = device_idims; - device_program->device_odims = device_odims; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), 
output_names.size()); + for (int i = 0; i < input_names.size(); i++) { + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + VLOG(3) << "[NPU] Init the input tensors for the device program and share " + "their buffers with the origin input tensors"; + // Reinit device tensor will free shared buffer, so copy data to a tmp + // tensor + Tensor tmp; + tmp.CopyDataFrom(*(*origin_itensors)[i]); + (*device_itensors)[i]->Init(&(device_idims_[i])); - CHECK_EQ(device_idims.size(), device_inames_.size()); - CHECK_EQ(device_odims.size(), device_onames_.size()); - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); + std::memcpy( + (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_idims[i].GetNumber() << "," - << device_idims[i].GetChannel() << "," - << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() - << "}"; - // Prepare the device input tensors - CHECK_EQ(origin_idims_[i].production(), - device_idims[i].GetNumber() * device_idims[i].GetChannel() * - device_idims[i].GetHeight() * device_idims[i].GetWidth()); - device_itensors_[i].reset(new hiai::AiTensor); - device_itensors_[i]->Init(&(device_idims[i])); + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_itensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); } - device_program->origin_idims = origin_idims_; - - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_odims[i].GetNumber() << "," - << device_odims[i].GetChannel() << "," - << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() - << "}"; - // Prepare the device output tensors - switch (precision) { - case PRECISION(kFloat): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kBool): - origin_otensors_[i]->mutable_data(); - break; - case 
PRECISION(kInt8): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt16): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt32): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt64): - origin_otensors_[i]->mutable_data(); - break; - default: - LOG(FATAL) << "[NPU] " << device_onames_[i] - << " can't mutable data with precision type " - << PrecisionToStr(precision); - break; - } - device_program->origin_odims = origin_odims_; - - CHECK_EQ(origin_odims_[i].production(), - device_odims[i].GetNumber() * device_odims[i].GetChannel() * - device_odims[i].GetHeight() * device_odims[i].GetWidth()); - device_otensors_[i].reset(new hiai::AiTensor); - device_otensors_[i]->Init(&(device_odims[i])); + for (int i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + (*device_otensors)[i]->Init(&(device_odims_[i])); + VLOG(3) << "[NPU] Init the output tensors for the device program and share " + "their buffers with the origin output tensors"; + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_otensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { - // Copy the data of origin input tensors to the buffer of input HiAI tensors - // init device_itensors_, device_otensors_, origin_otensors_ - auto device_program = device_program_map_[inputs_shape_]; - +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name hiai::AiContext model_context; @@ -234,88 +313,106 @@ int SubgraphEngine::LaunchDeviceProgram() { }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ(device_program->client->Process( - model_context, device_itensors_, device_otensors_, 1000, istamp), + CHECK_EQ(model_client_->Process( + model_context, *device_itensors, *device_otensors, 1000, istamp), hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - return 0; + return true; } -int SubgraphEngine::Build() { - if (device_program_map_.count(inputs_shape_) > 0) { - return subgraph::SUCCESS; +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + 
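// Note: the AiTensor is deliberately left unsized at this point; its
// dimensions are only known after the om model is loaded, so
// DeviceProgram::ShareBufferWithOriginTensors() later calls Init() on it with
// the dims queried from the model client and then rebinds the origin tensor
// to the device buffer.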
CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); } - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + return true; } -void SubgraphEngine::InitDeviceTensor() { - auto device_program = device_program_map_[inputs_shape_]; - for (size_t i = 0; i < device_itensors_.size(); i++) { - if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) { - VLOG(3) << "init device_itensors and share input tensor buf between " - "device and host"; - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); - // share data buf between device_itensor and origin_itensor - std::shared_ptr buffer = - std::make_shared(device_itensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_itensors_[i]->GetSize()); - origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(exec_scope_); + VLOG(3) << "[NPU] Getting subgraph_model_cache_dir: " << model_cache_dir; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile( + input_names_, output_names_, origin_idims_, model_cache_dir)) { + // Build the model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. 
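// For reference, the cache consumed by LoadFromCacheFile() and produced by
// BuildGraphAndCacheToFile() consists of two files per subgraph, both named
// by the MD5 model key:
//   <model_cache_dir>/<model_name_>.om   - the compiled HiAI om model
//   <model_cache_dir>/<model_name_>.cfg  - origin output metadata, one
//     "<precision>:<d0>,<d1>,...,;" record per output, where <precision> is
//     the integer value of the output's PrecisionType (each dim is followed
//     by ',' and each record ends with ';').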
+ if (!origin_program_) { + BuildOriginProgram(); + } + CHECK(origin_program_) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir)) { + return false; + } } - } - for (size_t i = 0; i < device_otensors_.size(); i++) { - if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { - VLOG(3) << "init device_otensors and share output tensor buf between " - "device and host"; - device_otensors_[i]->Init(&(device_program->device_odims[i])); - // share data buf between device_itensor and origin_itensor - origin_otensors_[i]->Resize(device_program->origin_odims[i]); - std::shared_ptr buffer = - std::make_shared(device_otensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_otensors_[i]->GetSize()); - origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + if (device_program->model_client_ == nullptr) { + return false; } + device_programs_[origin_idims_] = device_program; } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); } -bool SubgraphEngine::InputShapeChanged() { - std::vector> new_shape; - for (auto origin_itensor : origin_itensors_) { - new_shape.push_back(origin_itensor->dims().Vectorize()); +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. 
+ if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); } - if (inputs_shape_ == new_shape) { - return false; + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); } - inputs_shape_ = new_shape; - return true; + return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_); } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope, - NPUContext::SubgraphModelCacheDir())); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace npu diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 9f0b5a944137dbf9a521235b80398feca1cd82b0..2203acaee82704b2a9e93d8b14d708197d7afb1a 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -28,52 +28,69 @@ namespace lite { namespace kernels { namespace npu { +class DeviceProgram { + public: + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir); + bool BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors); + + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; + class SubgraphEngine : public subgraph::Engine { public: - SubgraphEngine(KernelContext *ctx, + SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc *block_desc, - const std::vector &input_names, - const std::vector &output_names, - Scope *scope, - std::string model_cache_dir = "") + const std::shared_ptr& program_desc, + Scope* exec_scope, + const std::vector& input_names, + const std::vector& output_names) : subgraph::Engine(ctx, block_idx, - block_desc, + program_desc, + exec_scope, input_names, - output_names, - scope, - model_cache_dir) {} - - struct device_program_t { - explicit device_program_t(std::shared_ptr _client) - : client(_client) {} - std::shared_ptr client{nullptr}; - std::vector origin_idims{}; - std::vector origin_odims{}; - std::vector device_idims{}; - std::vector device_odims{}; - }; - - int Build() override; + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; - - void InitDeviceTensor() override; - bool InputShapeChanged() override; - - std::string GenerateModelCacheName() const; + 
bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; - std::string model_name_{"model.om"}; - std::vector> inputs_shape_{}; - std::map>, std::shared_ptr> - device_program_map_{}; - std::vector device_inames_{}; - std::vector device_onames_{}; std::vector> device_itensors_{}; std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 600d0d22553af9d857d03491aabd2067db8f32ef..81e1a4d7562a9decab2e2daf4001faec7ac2fcee 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -21,6 +21,7 @@ add_kernel(fusion_elementwise_sub_activation_opencl add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) @@ -67,6 +68,9 @@ lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc DEPS reshape_opencl op_registry program context) +lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc + DEPS transpose_opencl layout_opencl op_registry program context) + lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc DEPS concat_opencl layout_opencl op_registry program context) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index fed8171cc273b437be411225363bf4a732769ae3..083f72134eba8afc7db696f68d64098b9c59a0f9 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -28,91 +28,83 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { -/* image kernel*/ + void ConvImageCompute::PrepareForRun() { - const auto& param = this->Param(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); + ReInitWhenNeeded(); + + auto filter_dims = conv_param_->filter->dims(); + filter_tensor_n_ = filter_dims[0]; + filter_tensor_c_ = filter_dims[1]; + filter_tensor_h_ = filter_dims[2]; + filter_tensor_w_ = filter_dims[3]; - float* filter_cpu = param.filter->mutable_data(); auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); const bool is_mali = context.cl_context()->IsArmMali(); - filter_gpu_image_ = std::unique_ptr(new Tensor); - tensor_hold_filter_image_ = std::unique_ptr(new Tensor); - tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - int bs = x_dims[0]; - int c_in = x_dims[1]; - int h_out = output_dims[2]; - int w_out = output_dims[3]; - int kernel_h = filter_dims[2]; // oihw - int kernel_w = filter_dims[3]; - auto paddings = *param.paddings; - auto dilations = *param.dilations; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - int groups = param.groups; - bool relu_fused = 
param.fuse_relu; - bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); - bool zero_pad = (pad_h == 0) && (pad_w == 0); - - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && - (paddings[2] == paddings[3])); - bool stride_equal = stride_h == stride_w; - bool dilation_equal = dilations[0] == dilations[1]; + + auto paddings = *conv_param_->paddings; + pad_up_ = paddings[0]; + pad_down_ = paddings[1]; + pad_left_ = paddings[2]; + pad_right_ = paddings[3]; + + auto dilations = *conv_param_->dilations; + dilation_h_ = dilations[0]; + dilation_w_ = dilations[1]; + + stride_h_ = conv_param_->strides[0]; + stride_w_ = conv_param_->strides[1]; + + groups_ = conv_param_->groups; + relu_fused_ = conv_param_->fuse_relu; + has_bias_ = (conv_param_->bias) != nullptr; + offset_ = filter_tensor_h_ / 2 - pad_up_; + + bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_left_) && + (pad_left_ == pad_right_)); + bool stride_equal = stride_h_ == stride_w_; + bool dilation_equal = dilation_h_ == dilation_w_; VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); - VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); - VLOG(3) << "groups:" << groups << " stride_h:" << stride_h - << " stride_w:" << stride_w << " pad_h:" << pad_h - << " pad_w:" << pad_w << " kernel_h:" << kernel_h - << " kernel_h:" << kernel_h; - VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1]; - VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " - << output_dims[2] << " " << output_dims[3]; - VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " - << filter_dims[2] << " " << filter_dims[3]; + VLOG(3) << "Is relu fused? / " << (relu_fused_ ? "Yes" : "No"); + VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_ + << " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_ + << " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_ + << " filter_tensor_h_:" << filter_tensor_h_; + VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_ + << " " << input_tensor_h_ << " " << input_tensor_w_; + VLOG(3) << "dialtion:" << dilation_h_ << " " << dilation_w_; + VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_ + << " " << output_tensor_h_ << " " << output_tensor_w_; + VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_ + << " " << filter_tensor_h_ << " " << filter_tensor_w_; VLOG(3) << "pad_equal:" << pad_equal; VLOG(3) << "stride_equal:" << stride_equal; VLOG(3) << "dilation_equal:" << dilation_equal; - VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " - << paddings[2] << " " << paddings[3]; + VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_ + << " " << pad_right_; CHECK(pad_equal && stride_equal && dilation_equal); + CHECK_GE(conv_param_->dilations->size(), 2); + CHECK(dilation_h_ == dilation_w_); + CHECK_GE(conv_param_->paddings->size(), 2); + CHECK(pad_left_ == pad_up_); + CHECK_GE(conv_param_->strides.size(), 2); + CHECK(stride_h_ == stride_w_); + + if (!is_mali) { + use_tune_ = false; + } - // general gws.. 
- auto out_image_shape = InitImageDimInfoWith(output_dims); - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - default_c_blk_ = default_work_size[0]; - default_w_blk_ = default_work_size[1]; - default_nh_blk_ = default_work_size[2]; - c_blk_ = default_c_blk_; - w_blk_ = default_w_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - - if (kernel_h == 1 && kernel_w == 1) { - // conv2d_1x1 - // if (param.x->dims()[1] % 4 == 0) { - // kernel_func_names_.push_back("conv2d_1x1_simple"); - // } else { - // kernel_func_names_.push_back("conv2d_1x1_opt"); - // } + /********************************************* + * Upload filter, bias to opencl device + *********************************************/ + float* filter_cpu = conv_param_->filter->mutable_data(); + filter_gpu_image_ = std::unique_ptr(new Tensor); + tensor_hold_filter_image_ = std::unique_ptr(new Tensor); + tensor_hold_bias_image_ = std::unique_ptr(new Tensor); - if (param.x->dims()[1] % 4 == 0) { + if (filter_tensor_h_ == 1 && filter_tensor_h_ == 1) { + if (input_tensor_c_ % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { kernel_func_names_.push_back("conv2d_1x1_opt"); @@ -121,89 +113,49 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - // std::vector filter_image_v(filter_image_dims[0] * - // filter_image_dims[1] * 4); // 4 : - // RGBA - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); - + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d1x1opt; - { - // calc 1x1 gws - w_blk_ = maptofactor(default_w_blk_, 4); - c_blk_ = default_c_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && - kernel_h == 3 && kernel_w == 3 && groups > 1) { + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ && + filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) { // depth_conv2d_3x3s1, depth_conv2d_3x3 - if (stride_h == 1 && dilations[0] == 1) { + if (stride_h_ == 1 && dilation_h_ == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; - { - // depthwise spl gws s1 - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - c_blk_ = c_block; - w_blk_ = w_blk; - nh_blk_ = nh; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; - { - // depthwise spl gws - int c_block = 
(output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - c_blk_ = c_block; - w_blk_ = w; - nh_blk_ = nh; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); #endif - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ #ifdef DEPTH_CONV_USE_SPL && - kernel_h != 3 + filter_tensor_h_ != 3 #endif #undef DEPTH_CONV_USE_SPL ) { @@ -213,75 +165,61 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_w == 3 && kernel_h == 3) { + } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { // #define CONV3x3OPT_FALL_BACK #ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3opt; - - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #else kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3; - #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { + } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT // conv2d_5x5 @@ -290,55 +228,42 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5; #else // conv2d_5x5_opt - kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_5x5_OPT - } else if (kernel_h == 7 && kernel_w == 7) { + } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) { #define CONV_7x7_OPT #ifndef CONV_7x7_OPT // conv2d_7x7 @@ -347,52 +272,39 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7; #else // conv2d_7x7 - kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" - : "conv2d_7x7_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_7x7_OPT } else { @@ -404,30 +316,30 @@ void ConvImageCompute::PrepareForRun() { // build options std::string build_options_single(" -DCL_DTYPE_half"); // relu options - VLOG(3) << "relu_fused:" << relu_fused - << " param.activation_param.active_type:" - << static_cast(param.activation_param.active_type) - << " param.activation_param.has_active:" - << param.activation_param.has_active; - if (param.activation_param.has_active) { - if (param.activation_param.active_type == - lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused` + VLOG(3) << "relu_fused_:" << relu_fused_ + << " conv_param_->activation_param.active_type:" + << static_cast(conv_param_->activation_param.active_type) + << " conv_param_->activation_param.has_active:" + << conv_param_->activation_param.has_active; + if (conv_param_->activation_param.has_active) { + if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused_` // also is ok build_options_single += " -DRELU"; - } else if (param.activation_param.active_type == + } else if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu6) { build_options_single += " -DRELU6"; } else { LOG(FATAL) << "Unsupported activation type:" - << static_cast(param.activation_param.active_type); + << static_cast(conv_param_->activation_param.active_type); } } + GetGlobalWorkSize(); // bias options - const bool has_bias = param.bias != nullptr; const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); + if (has_bias_) { bias_gpu_image_ = std::unique_ptr(new Tensor); build_options_single += is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; @@ -435,21 +347,36 @@ void ConvImageCompute::PrepareForRun() { // convert cpu buffer bias --> gpu image CLImageConverterFolder bias_converter; const DDim& bias_image_dims = - bias_converter.InitImageDimInfoWith(param.bias->dims()); - + bias_converter.InitImageDimInfoWith(conv_param_->bias->dims()); + bias_image_h_ = bias_image_dims[1]; + bias_image_w_ = bias_image_dims[0]; tensor_hold_bias_image_->Resize( {1, bias_image_dims[0], bias_image_dims[1], 4}); half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); - float* bias_cpu_data = param.bias->mutable_data(); + float* bias_cpu_data = conv_param_->bias->mutable_data(); bias_converter.NCHWToImage( - bias_cpu_data, bias_image_data, param.bias->dims()); + bias_cpu_data, bias_image_data, conv_param_->bias->dims()); this->bias_gpu_image_->mutable_data( bias_image_dims[0], bias_image_dims[1], bias_image_data); // convert cpu buffer bias --> gpu image --- end ---- + } else { + bias_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder bias_converter; + tensor_hold_bias_image_->Resize({1, 1, 1, 4}); + half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); + this->bias_gpu_image_->mutable_data( + 1, 1, bias_image_data); } + // define image pointer for filter, bias + input_image_p_ = conv_param_->x->data(); + filter_image_p_ = filter_gpu_image_->data(); + bias_image_p_ = bias_gpu_image_->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -475,55 +402,55 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; if (max_work_group_size > 0 && use_lws_) { - double min_turn_time = DBL_MAX; + double min_tune_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; - if (use_turn_) { + if (use_tune_) { for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; } // reverse for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( 
global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; @@ -537,548 +464,316 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::ReInitWhenNeeded() { + conv_param_ = param_.get_mutable(); + auto x_dims = conv_param_->x->dims(); #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_ + << ", last_input_dims_:" << last_input_dims_ + << ", x_dims:" << x_dims; #endif -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -// VLOG(4) << "default work size{c_block, w, nh}: " -// << "{" << c_block << ", " << w << ", " << nh << "" -// << 
"}"; -#endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, default_w_blk_); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { - CLRuntime::Global()->command_queue().finish(); - } -} -void ConvImageCompute::Conv2d3x3(bool is_turn) { - auto kernel = kernel_; - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - int filter_channel = filter_dims[1]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - // re-calc group - int new_groups{param.groups}; - if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) { 
- new_groups = 1; - } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { - new_groups = input_channel / filter_channel; - } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; + if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) { + is_first_epoch_for_run_ = false; + last_input_dims_ = x_dims; + + input_tensor_n_ = x_dims[0]; + input_tensor_c_ = x_dims[1]; + input_tensor_h_ = x_dims[2]; + input_tensor_w_ = x_dims[3]; + auto x_image_shape = InitImageDimInfoWith(x_dims); + input_image_h_ = x_image_shape["height"]; + input_image_w_ = x_image_shape["width"]; + + auto output_dims = conv_param_->output->dims(); + output_tensor_n_ = output_dims[0]; + output_tensor_c_ = output_dims[1]; + output_tensor_h_ = output_dims[2]; + output_tensor_w_ = output_dims[3]; + auto output_image_shape = InitImageDimInfoWith(output_dims); + output_image_h_ = output_image_shape["height"]; + output_image_w_ = output_image_shape["width"]; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + CHECK_GE(conv_param_->x->dims().size(), 4); + CHECK_GE(conv_param_->output->dims().size(), 4); + if (kernel_func_names_.size() > 0 && + kernel_func_names_[0] == "conv2d_3x3") { + groups_ = conv_param_->groups; + if (filter_tensor_n_ == output_tensor_c_ && + filter_tensor_c_ == input_tensor_c_) { + groups_ = 1; + } else if (!(filter_tensor_n_ == input_tensor_c_ && + filter_tensor_c_ == 1)) { + groups_ = input_tensor_c_ / filter_tensor_c_; + } } - */ - - // const std::vector& default_work_size = - // DefaultWorkSize(output_dims, - // DDim(std::vector{ - // static_cast(out_image_shape["width"]), - // static_cast(out_image_shape["height"])})); - - // int c_block = default_work_size[0]; - // int w = default_work_size[1]; - // int nh = default_work_size[2]; - - // VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_c_block: " << input_c_block; - // VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - // VLOG(4) << "input_dims: " << input_dims; - // VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - // VLOG(4) << "output_dims: " << output_dims; - // VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - // << out_image_shape["height"]; - // VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - // VLOG(4) << "has bias: " << has_bias; - // VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - // VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - // VLOG(4) << "offset: " << offset; - // VLOG(4) << "dilations.size : " << dilations.size(); - // VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - // VLOG(4) << "param.groups(groups):" << param.groups; - // VLOG(4) << "new_groups:" << new_groups; - // VLOG(4) << "default work size{c_block, w, nh}: " - // << "{" << c_block << ", " << w << ", " << nh << "" - // << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - auto& context = 
ctx_->As(); - CHECK(context.cl_context() != nullptr); - // STL::stringstream kernel_key; - // kernel_key << kernel_func_names_[0] << build_options_[0]; - // auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - // VLOG(4) << "kernel_key: " << kernel_key.str(); - // VLOG(4) << "kernel ready ... " << kernel_key.str(); - // VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + // define image pointer for input, output + input_image_p_ = conv_param_->x->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + + GetGlobalWorkSize(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, new_groups); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_dims[1])); - CL_CHECK_FATAL(status); - - // auto global_work_size = - // cl::NDRange{static_cast(default_work_size.data()[0]), - // static_cast(default_work_size.data()[1]), - // static_cast(default_work_size.data()[2])}; - - // VLOG(4) << "out_image: " << out_image; - // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - // << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Conv2d3x3opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int 
batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif +void ConvImageCompute::GetGlobalWorkSize() { + if (kernel_func_names_.size() <= 0) return; + // general input_c_block + input_c_block_ = static_cast(input_image_w_ / input_tensor_w_); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + // general gws + auto output_dims = conv_param_->output->dims(); + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + if (kernel_func_names_[0] == "conv2d_1x1_simple" || + kernel_func_names_[0] == "conv2d_1x1_opt") { + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + + } else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") { + // depthwise spl gws s1 + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = 
cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "depth_conv2d_3x3") { + // depthwise spl gws + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || + kernel_func_names_[0] == "conv2d_3x3_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || + kernel_func_names_[0] == "conv2d_5x5_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || + kernel_func_names_[0] == "conv2d_7x7_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d1x1opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, default_w_blk_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 
2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, filter_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, groups_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, input_tensor_c_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d3x3opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + 
CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); #ifdef LITE_WITH_LOG // VLOG(4) << "out_image: " << out_image; @@ -1086,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - -// default_work_size[2] = h_blk; +void ConvImageCompute::Conv2d5x5(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << 
dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - // VLOG(4) << "out_image: " << out_image; + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* 
input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d5x5opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d7x7(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d7x7opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_);
+ status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); +void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d 7x7 params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == 
paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + auto& context = ctx_->As(); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if 
(param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); +void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != 
nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if (param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - +void ConvImageCompute::DepthwiseConv2d(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "setArg"; - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, 
stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); +void ConvImageCompute::Run() { (this->*impl_)(false); } - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; +void ConvImageCompute::PrintConvInfo() { + const bool is_element_wise_bias = + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_; + // VLOG(4) << "input_image: " << input_image_p_; + VLOG(4) << "input_dims: " << conv_param_->x->dims(); + VLOG(4) << "filter_dims: " << conv_param_->filter->dims(); // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "output_dims: " << conv_param_->output->dims(); + VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_; + VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_; 
+ VLOG(4) << "has bias: " << has_bias_; VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - -#ifdef LITE_WITH_LOG + VLOG(4) << "strides: " << stride_h_ << "," << stride_w_; + VLOG(4) << "offset: "; + VLOG(4) << "dilations.size : " << conv_param_->dilations->size(); + VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Run() { (this->*impl_)(false); } - -double ConvImageCompute::Turn(int times) { +double ConvImageCompute::Tune(int times) { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 64276a5721cb20718604d91d3cfac31e583ddbf1..e61557a71dfbf1353decc9491b67c5e1e326512e 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -33,6 +33,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { + class ConvImageCompute : public KernelLite { @@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; @@ -79,19 +87,72 @@ class ConvImageCompute 
: public KernelLite tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; + + // opencl kernel args int c_blk_ = 1; int w_blk_ = 1; int nh_blk_ = 1; + const cl::Image2D* input_image_p_{nullptr}; + const cl::Image2D* filter_image_p_{nullptr}; + const cl::Image2D* bias_image_p_{nullptr}; + const cl::Image2D* output_image_p_{nullptr}; + + int stride_h_{-1}; + int stride_w_{-1}; + + int dilation_h_{-1}; + int dilation_w_{-1}; + + int pad_up_{-1}; + int pad_down_{-1}; + int pad_left_{-1}; + int pad_right_{-1}; + + int offset_{-1}; + int groups_{-1}; + bool relu_fused_{false}; + bool has_bias_{false}; + + int input_tensor_n_{-1}; + int input_tensor_c_{-1}; + int input_tensor_h_{-1}; + int input_tensor_w_{-1}; + int input_image_h_{-1}; + int input_image_w_{-1}; + int input_c_block_{-1}; + + int output_tensor_n_{-1}; + int output_tensor_c_{-1}; + int output_tensor_h_{-1}; + int output_tensor_w_{-1}; + int output_image_h_{-1}; + int output_image_w_{-1}; + + int filter_tensor_n_{-1}; + int filter_tensor_c_{-1}; + int filter_tensor_h_{-1}; + int filter_tensor_w_{-1}; + int filter_image_h_{-1}; + int filter_image_w_{-1}; + + int bias_image_h_{-1}; + int bias_image_w_{-1}; + int default_c_blk_ = 1; int default_w_blk_ = 1; int default_nh_blk_ = 1; + // ================= + + DDim last_input_dims_{}; + bool is_first_epoch_for_run_{true}; cl::Kernel kernel_; + cl_int status_; cl::NDRange local_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_turn_{false}; + bool use_tune_{true}; }; } // namespace opencl diff --git a/lite/kernels/opencl/expand_image_compute_test.cc b/lite/kernels/opencl/expand_image_compute_test.cc index 1fa046c938a4b45bec0ae9842ed51fc0805b4131..c372855193e938081208addce058e3e38b692cbb 100644 --- a/lite/kernels/opencl/expand_image_compute_test.cc +++ b/lite/kernels/opencl/expand_image_compute_test.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
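The conv refactor above replaces per-run parameter parsing with cached member state that is bound to fixed kernel-argument slots, and the tuning path times a launch by draining the command queue. The following is a minimal standalone sketch of that pattern; ConvArgs, BindArgs, LaunchAndMaybeTime and the reduced argument list are illustrative names for this sketch, not the Paddle-Lite classes.

// Sketch only: bind cached host-side values to explicit OpenCL argument
// slots, then optionally wait for completion so a wall-clock measurement
// covers the kernel (the tuning path above finishes the queue the same way).
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>
#include <chrono>
#include <stdexcept>

struct ConvArgs {  // hypothetical cached state; the real kernels take more
  cl_int c_blk, w_blk, nh_blk;
  cl::Image2D input, filter, bias, output;
  cl_int stride_h, pad_left, dilation_h;
};

inline void CheckCL(cl_int status) {
  if (status != CL_SUCCESS) throw std::runtime_error("OpenCL call failed");
}

// Set every argument by explicit index so the host-side order stays in
// lockstep with the kernel signature and a skipped slot is caught early.
void BindArgs(cl::Kernel& kernel, const ConvArgs& a) {
  cl_uint i = 0;
  CheckCL(kernel.setArg(i++, a.c_blk));
  CheckCL(kernel.setArg(i++, a.w_blk));
  CheckCL(kernel.setArg(i++, a.nh_blk));
  CheckCL(kernel.setArg(i++, a.input));
  CheckCL(kernel.setArg(i++, a.filter));
  CheckCL(kernel.setArg(i++, a.bias));
  CheckCL(kernel.setArg(i++, a.output));
  CheckCL(kernel.setArg(i++, a.stride_h));
  CheckCL(kernel.setArg(i++, a.pad_left));
  CheckCL(kernel.setArg(i++, a.dilation_h));
}

// Enqueue once; when tuning, block so the timing includes kernel execution.
double LaunchAndMaybeTime(cl::CommandQueue& queue, cl::Kernel& kernel,
                          const cl::NDRange& global, const cl::NDRange& local,
                          bool tune) {
  auto t0 = std::chrono::high_resolution_clock::now();
  CheckCL(queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local));
  if (tune) queue.finish();
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::micro>(t1 - t0).count();
}

Caching the image pointers and scalar arguments once per shape change, as the header members above do, keeps the per-inference path down to the setArg and enqueue calls.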
-#include #include +#include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" @@ -54,11 +54,11 @@ TEST(expand_hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -179,11 +179,11 @@ TEST(expand_c2hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -303,11 +303,11 @@ TEST(expand_c3hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -428,11 +428,11 @@ TEST(expand_c4hw_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); @@ -551,11 +551,11 @@ TEST(expand_n_image2d, compute) { context->As().InitOnce(); kernel->SetParam(param); - std::unique_ptr pixel_shuffle_context(new KernelContext); + std::unique_ptr expand_context(new KernelContext); context->As().CopySharedTo( - &(pixel_shuffle_context->As())); + &(expand_context->As())); - kernel->SetContext(std::move(pixel_shuffle_context)); + kernel->SetContext(std::move(expand_context)); const DDim in_dim = DDim(std::vector{INPUT_N, INPUT_C, INPUT_H, INPUT_W}); diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -35,10 +35,27 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override {} + void PrepareForRun() override { + fc_param_ = param_.get_mutable(); + auto w_t = fc_param_->w; + auto bias_t = fc_param_->bias; + + w_gpu_t_ = std::unique_ptr(new Tensor); + auto w_gpu_data = + w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size()); + TargetWrapperCL::MemcpySync( + w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD); + + bias_gpu_t_ = std::unique_ptr(new Tensor); + auto b_gpu_data = + bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size()); + TargetWrapperCL::MemcpySync(b_gpu_data, + 
bias_t->raw_data(), + bias_t->memory_size(), + IoDirection::HtoD); + } void ReInitWhenNeeded() override { - fc_param_ = param_.get_mutable(); const auto x_dims = fc_param_->input->dims(); if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || first_epoch_for_reinit_) { @@ -93,7 +110,7 @@ class FcCompute } void GetGlobalWorkSize() { - if (m_ == 1) { // gemv + if (kernel_func_name_ == "fc_gemv_1x4") { // gemv global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), @@ -103,8 +120,8 @@ class FcCompute void Run() override { auto* x_buf = fc_param_->input->data(); - auto* w_buf = fc_param_->w->data(); - auto* bias_buf = fc_param_->bias->data(); + auto* w_buf = w_gpu_t_->data(); + auto* bias_buf = bias_gpu_t_->data(); auto* out_buf = fc_param_->output->mutable_data(TARGET(kOpenCL)); @@ -154,6 +171,10 @@ class FcCompute std::string time_stamp_{GetTimeStamp()}; bool first_epoch_for_reinit_{true}; DDim last_x_dims_; + + std::unique_ptr w_gpu_t_{nullptr}; + std::unique_ptr bias_gpu_t_{nullptr}; + cl::NDRange global_work_size_; cl::Kernel kernel_; }; @@ -166,7 +187,7 @@ class FcCompute REGISTER_LITE_KERNEL( fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644 --- a/lite/kernels/opencl/fc_buffer_compute_test.cc +++ b/lite/kernels/opencl/fc_buffer_compute_test.cc @@ -126,9 +126,11 @@ TEST(fc, compute) { out.Resize(out_dim); out_ref.Resize(out_dim); + VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); - auto* w_data = w.mutable_data(TARGET(kOpenCL)); - auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); + auto* w_data = w.mutable_data(); + auto* bias_data = bias.mutable_data(); auto* out_data = out.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; @@ -148,17 +150,15 @@ TEST(fc, compute) { } for (size_t i = 0; i < w_dim.production(); ++i) { w_source[i] = static_cast(dist(engine)); + w_data[i] = w_source[i]; } for (size_t i = 0; i < bias_dim.production(); ++i) { bias_source[i] = 10; // static_cast(dist(engine)); + bias_data[i] = 10; } TargetWrapperCL::MemcpySync( x_data, x_source.data(), x_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - w_data, w_source.data(), w_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - bias_data, bias_source.data(), bias_size, IoDirection::HtoD); // run opencl kernel kernel->Launch(); @@ -186,8 +186,10 @@ TEST(fc, compute) { #endif std::vector out_data_from_gpu(out_dim.production()); - TargetWrapperCL::MemcpySync( - out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH); + TargetWrapperCL::MemcpySync(out_data_from_gpu.data(), + out_data, + out_data_from_gpu.size() * sizeof(float), + IoDirection::DtoH); // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); diff --git a/lite/kernels/opencl/nearest_interp_image_compute_test.cc 
b/lite/kernels/opencl/nearest_interp_image_compute_test.cc index 4a9948832d1a96d95a7f317bd3ac8245292ae02b..fb40da290d10ed49f293cf7ff78865f2e7967eab 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute_test.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute_test.cc @@ -155,6 +155,7 @@ TEST(nearest_interp_image2d, compute) { auto *x_data = x.mutable_data(TARGET(kOpenCL)); auto *y_data = y.mutable_data(TARGET(kOpenCL)); auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); + memset(reinterpret_cast(y_data_ref), 0, y_ref.numel()); auto *mapped_x = static_cast(TargetWrapperCL::Map( x_data, 0, sizeof(float) * x_dim.production())); auto *mapped_y = static_cast(TargetWrapperCL::Map( diff --git a/lite/kernels/opencl/transpose_image_compute.cc b/lite/kernels/opencl/transpose_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..31184092efa40cea47c3cacb6a65f03d15a229b2 --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +#undef LITE_WITH_LOG + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +// transpose operator +class TransposeComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + if (out_dims.size() == 4) { + kernel_func_name_ = "transpose_4d"; + } else { + kernel_func_name_ = "transpose"; + } + auto& context = ctx_->As(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, + "image/transpose_kernel.cl", + build_options_, + time_stamp_); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void Run() override { + auto& param = *param_.get_mutable(); + const Tensor* const x = param.x; + const auto x_dims = x->dims(); + const std::map& input_image_shape = + InitImageDimInfoWith(x_dims); + const int64_t& input_image_width = input_image_shape.at("width"); + const int64_t& input_image_height = input_image_shape.at("height"); + const cl::Image2D* const x_image = x->data(); + + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + VLOG(4) << "out_dims= " 
<< out_dims; + const std::map& out_image_shape = + InitImageDimInfoWith(out_dims); + cl::Image2D* const out_image = output->mutable_data( + out_image_shape.at("width"), out_image_shape.at("height")); +#ifdef LITE_WITH_LOG + VLOG(4) << "out_dims= " << out_dims; +#endif + const std::vector& default_work_size = DefaultWorkSize( + out_dims, + DDim(std::vector{ + static_cast(out_image_shape.at("width")), + static_cast(out_image_shape.at("height"))})); + + int out_C = 0, out_H = 0, out_W = 0, in_W = 0; + if (param.output->dims().size() == 4) { + out_C = out_dims[1]; + out_H = out_dims[2]; + out_W = out_dims[3]; + in_W = x_dims[3]; + } else if (param.output->dims().size() == 3) { + out_C = out_dims[0]; + out_H = out_dims[1]; + out_W = out_dims[2]; + in_W = x_dims[2]; + } else if (param.output->dims().size() == 2) { + out_C = 1; + out_H = out_dims[0]; + out_W = out_dims[1]; + in_W = x_dims[1]; + } + +#ifdef LITE_WITH_LOG + VLOG(4) << "out_C=" << out_C; + VLOG(4) << "out_H=" << out_H; + VLOG(4) << "out_W=" << out_W; + VLOG(4) << "in_W=" << in_W; + VLOG(4) << "default_work_size= " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifdef LITE_WITH_LOG + VLOG(4) << TargetToStr(x->target()); + VLOG(4) << TargetToStr(param.output->target()); +#endif + + int arg_idx = 0; + cl_int status; + status = kernel.setArg(arg_idx, *x_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_H); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_W); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, in_W); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } + + private: + std::string kernel_func_name_{"transpose"}; + std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; +}; + +// transpose2 operator +class Transpose2ComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override {} + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {} +#endif + + bool IsShuffleChannel(const std::vector& axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; + } + + template + void DeviceTensorToHostTensor(const Tensor* device_tensor, + Tensor* host_tensor) { + host_tensor->Resize(device_tensor->dims()); + Dtype* host_ptr = host_tensor->mutable_data(); + CLRuntime::Global()->command_queue().finish(); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + half_t* image_data = new 
half_t[device_tensor_image_dim.production() * 4]; + TargetWrapperCL::ImgcpySync(image_data, + device_tensor->data(), + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::DtoH); + default_converter.ImageToNCHW( + image_data, host_ptr, device_tensor_image_dim, host_tensor->dims()); + delete[] image_data; + } + + template + void HostTensorToDeviceTensor(const Tensor* host_tensor, + Tensor* device_tensor) { + Dtype* host_ptr = const_cast(host_tensor->data()); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + device_tensor->mutable_data( + device_tensor_image_dim[0], device_tensor_image_dim[1]); + half_t* image_data = new half_t[device_tensor->dims().production() * 4]; + default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims()); + + TargetWrapperCL::ImgcpySync( + device_tensor->mutable_data(), + image_data, + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::HtoD); + + delete[] image_data; + } + + template + void ShuffleChannelCompute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < param.axis.size(); ++i) { + offset *= in_dim[i]; + } +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = + ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + template + void Transpose2Compute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + auto& axis = param.axis; + int permute = axis.size(); // permute must >=2 && <= 6. 
+ for (int i = 0; i < permute; ++i) { + int k = permute - 1 - i; + strides[k] = 1; + for (int j = axis[i] + 1; j < permute; ++j) { + strides[k] *= in_dim[j]; + } + rout_dim[k] = out_dim[i]; + } + + // unroll the first 2 dimensions + int reamin_dim = 1; + for (int i = 2; i < out_dim.size(); ++i) { + reamin_dim *= out_dim[i]; + } + +#pragma omp parallel for collapse(2) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int j = 0; j < out_dim[1]; ++j) { + size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; + Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; + int indics[4] = {0, 0, 0, 0}; + for (int k = 0; k < reamin_dim; ++k) { + out_ptr[k] = input_ptr[offset]; + indics[0] += 1; + offset += strides[0]; + for (int p = 0; p < permute - 3; ++p) { + if (indics[p] == rout_dim[p]) { + indics[p + 1] += 1; + indics[p] = 0; + offset += strides[p + 1]; + offset -= rout_dim[p] * strides[p]; + } else { + break; + } + } + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + void Run() override { + auto& param = *param_.get_mutable(); + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(param); + } else { + Transpose2Compute(param); + } + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(transpose, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::TransposeComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::Transpose2ComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/transpose_image_compute_test.cc b/lite/kernels/opencl/transpose_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9db9b3732d44aa3f342a8cf8b7b2fe5819586a5f --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
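The transpose2 kernel above falls back to host-side computation: the image is copied back to an NCHW buffer, permuted on the CPU, and written back as an image. Below is a small reference version of the permutation step only, independent of the OpenCL types; Transpose4D is an illustrative name and the converter and image copies are omitted. It is a generic sketch, not the optimized strided loop used in the kernel.

// Reference 4-D NCHW transpose: out coordinate i reads the input element at
// the stride of the axis it was taken from.
#include <array>
#include <cassert>
#include <iostream>
#include <vector>

std::vector<float> Transpose4D(const std::vector<float>& in,
                               const std::array<int, 4>& in_dim,
                               const std::array<int, 4>& axis) {
  std::array<int, 4> out_dim;
  for (int i = 0; i < 4; ++i) out_dim[i] = in_dim[axis[i]];
  // input strides in NCHW order
  std::array<int, 4> in_stride{in_dim[1] * in_dim[2] * in_dim[3],
                               in_dim[2] * in_dim[3], in_dim[3], 1};
  std::vector<float> out(in.size());
  int idx = 0;
  std::array<int, 4> o{};  // output coordinate
  for (o[0] = 0; o[0] < out_dim[0]; ++o[0])
    for (o[1] = 0; o[1] < out_dim[1]; ++o[1])
      for (o[2] = 0; o[2] < out_dim[2]; ++o[2])
        for (o[3] = 0; o[3] < out_dim[3]; ++o[3]) {
          int src = 0;
          for (int i = 0; i < 4; ++i) src += o[i] * in_stride[axis[i]];
          out[idx++] = in[src];
        }
  return out;
}

int main() {
  // 1x2x3x4 tensor filled with 0..23, permuted with axis {0, 2, 3, 1},
  // matching the axis order exercised by the unit test below.
  std::vector<float> x(24);
  for (int i = 0; i < 24; ++i) x[i] = static_cast<float>(i);
  auto y = Transpose4D(x, {1, 2, 3, 4}, {0, 2, 3, 1});
  assert(y.size() == x.size());
  std::cout << y[1] << "\n";  // input element (n=0, c=1, h=0, w=0) -> 12
  return 0;
}

The shuffle-channel fast path above is the special case axis = (0, 2, 1, ...), which lets the inner dimensions be moved with a single memcpy per block instead of an element-wise loop.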
+ +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/operators/reshape_op.h" +#include "lite/utils/logging.h" + +#define FP16_MAX_DIFF (5e-1) + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +static inline void TestWithKernel( + const std::unique_ptr& kernel) { + int64_t batch_size = 1; + int64_t ic = 2; + int64_t ih = 3; + int64_t iw = 4; + + int64_t oc = 3; + int64_t oh = 4; + int64_t ow = 2; + + lite::Tensor input, output; + operators::TransposeParam param; + + param.x = &input; + param.output = &output; + param.axis = std::vector({0, 2, 3, 1}); + const DDim input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + input.Resize(input_dim); + const DDim output_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + param.output->Resize(output_dim); + + LOG(INFO) << "prepare kernel SetParam------"; + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t output_image_width = ow * ((oc + 3) / 4); + size_t output_image_height = oh * batch_size; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::vector input_v(batch_size * ic * ih * iw); + + LOG(INFO) << "gen input ..."; + + float* input_v_data = &input_v[0]; + auto index = 0; + for (auto& i : input_v) { + i = index++; + } + + paddle::lite::CLImageConverterDefault default_convertor; + + std::vector x_image_data(input_image_width * input_image_height * + 4); // 4 : RGBA + + LOG(INFO) << "set mapped input ..."; + default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); + + auto* input_image = input.mutable_data( + input_image_width, input_image_height, x_image_data.data()); + + LOG(INFO) << "prepare kernel ready"; + + LOG(INFO) << "mutable output ..."; + CLImageConverterDefault default_converter; + DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output.mutable_data( + out_image_shape[0], out_image_shape[1]); + + LOG(INFO) << "kernel context ..."; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr transpose_context(new KernelContext); + context->As().CopySharedTo( + &(transpose_context->As())); + kernel->SetContext(std::move(transpose_context)); + + LOG(INFO) << "kernel launch ..."; + kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + half_t* out_image_data = new half_t[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + output.data(), + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter.ImageToNCHW( + out_image_data, out_data, out_image_shape, output_dim); + + // check output data + index = 0; + auto hxw = ih * iw; + auto cxhxw = ic * hxw; + for (auto n = 0; n < batch_size; n++) { + for (auto h = 0; h < ih; h++) { + for (auto w = 0; w < iw; w++) { + for (auto c = 0; c < ic; c++) { + auto input_index = n * cxhxw + c * hxw + h * iw + w; + auto input_value = input_v_data[input_index]; + auto output_value = out_data[index]; + auto abs_diff = abs(input_value - output_value); + auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, 
output_value); + EXPECT_EQ( + (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), + true); + index++; + } + } + } + } +} + +TEST(transpose_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +TEST(transpose2_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose2", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d); diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc index e0b63205705609b6899918ce8e254ccdf6cbad47..da01539b291d57da1501f8c3790acae8496581f3 100644 --- a/lite/kernels/rknpu/subgraph_compute.cc +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -28,117 +28,55 @@ namespace lite { namespace kernels { namespace rknpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; int status = 0; // Convert all of ops and their input vars and weights and added into the NPU // RKNPU IR graph subgraph::rknpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kRKNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kRKNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Collect the valid input and output nodes in the RKNPU IR graph and update // the input and output names - device_inames_.clear(); - device_onames_.clear(); - - for (auto& input_name : input_names_) { - LOG(INFO) << "[RKNPU] Input node " << input_name; - if (graph.Has(input_name)) { - LOG(INFO) << input_name << " Precision " - << PrecisionToStr(graph.Get(input_name)->precision()); - device_itensors_.push_back(graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[RKNPU] Input node " << input_name - << " is ignored because it does not exist."; - } - } - - for (auto& output_name : output_names_) { - LOG(INFO) << "[RKNPU] Output node " << output_name; - if (graph.Has(output_name)) { - auto tensor = scope_->FindMutableTensor(output_name); - LOG(INFO) << output_name << " Precision " - << PrecisionToStr(tensor->precision()); - device_otensors_.push_back(graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[RKNPU] Output node " << output_name - << " is ignored because it does not exist."; - } - } - CHECK(!device_inames_.empty()) - << "[RKNPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[RKNPU] No output nodes found for building NPU model"; - - device_program_ = lite::rknpu::Device::Global().Build( - model_name_, graph.GetHandle(), 
device_itensors_, device_otensors_); - if (device_program_ == nullptr) { - LOG(WARNING) << "[RKNPU] Build model failed!"; - return subgraph::FAILED; - } - - // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); + device_itensors_.clear(); + device_otensors_.clear(); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - } - // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); - for (size_t i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - - auto output_dims = origin_otensors_[i]->dims(); - } - - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); + CHECK(graph.Has(input_names_[i])) << "[RKNPU] Failed to find input node " + << input_names_[i]; + auto node = graph.Get(input_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - - LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i] + LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << input_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); + device_itensors_.push_back(node->data()); } - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); + for (size_t i = 0; i < output_names_.size(); i++) { + CHECK(graph.Has(output_names_[i])) << "[RKNPU] Failed to find output node " + << output_names_[i]; + auto node = graph.Get(output_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i] + LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << output_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); // Prepare the device output tensors @@ -159,22 +97,30 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_[i]->mutable_data(); break; default: - LOG(FATAL) << "[RKNPU] " << device_onames_[i] + LOG(FATAL) << "[RKNPU] " << output_names_[i] << " can't mutable data with precision type " << PrecisionToStr(precision); break; } + device_otensors_.push_back(node->data()); + } + // Create the RKNPU model and set the input and output nodes + device_program_ = lite::rknpu::Device::Global().Build( + model_name_, graph.GetHandle(), device_itensors_, device_otensors_); + if (device_program_ == nullptr) { + LOG(WARNING) << "[RKNPU] Build model failed!"; + return false; } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { LOG(INFO) << "[RKNPU]:LaunchDeviceProgram"; std::vector inputs; std::vector outputs; - 
inputs.resize(device_itensors_.size()); - for (size_t i = 0; i < device_itensors_.size(); i++) { + inputs.resize(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); i++) { inputs[i].index = i; inputs[i].buf = const_cast(origin_itensors_[i]->raw_data()); inputs[i].size = origin_itensors_[i]->memory_size(); @@ -184,8 +130,8 @@ int SubgraphEngine::LaunchDeviceProgram() { inputs[i].layout = rk::nn::DataLayoutType::NCHW; } - outputs.resize(device_otensors_.size()); - for (size_t i = 0; i < device_otensors_.size(); i++) { + outputs.resize(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); i++) { outputs[i].index = i; outputs[i].buf = const_cast(origin_otensors_[i]->raw_data()); outputs[i].size = origin_otensors_[i]->memory_size(); @@ -195,26 +141,25 @@ int SubgraphEngine::LaunchDeviceProgram() { device_program_->SetInputs(inputs); device_program_->Run(); device_program_->GetOutputs(outputs); - return 0; + return true; } void SubgraphCompute::PrepareForRun() { LOG(INFO) << "[RKNPU]:PrepareForRun"; auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { LOG(INFO) << "[RKNPU]:Run"; CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace rknpu diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h index 863e6aef39ad54f0e9d94d4b507c6fca4128ebb8..78162b3d165bde8e33436654bbcd1110ad9afea6 100644 --- a/lite/kernels/rknpu/subgraph_compute.h +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -34,22 +34,26 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::string model_name_; std::vector device_inames_; std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 2910364f37b74d94977e2397e31eb97fd367825e..9b4c2fadd9ce427db272a9bb0cfd0e0a10716f11 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftsignCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute_test.cc b/lite/kernels/x86/activation_compute_test.cc index 
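The RKNPU subgraph changes above drop the old int/subgraph::FAILED status protocol in favour of plain bool returns, and the device program is now built lazily: PrepareForRun no longer calls engine_->Build(), the first engine_->Run() does. Below is a toy, self-contained model of that contract; only the BuildDeviceProgram/LaunchDeviceProgram override names come from this patch, everything else (ToyEngine, the built_ flag) is illustrative.

```cpp
// Toy model of the refactored subgraph engine contract: the device program is
// built lazily on the first Run() instead of inside PrepareForRun(), and both
// hooks report success/failure as bool rather than a FAILED status code.
#include <iostream>

class ToyEngine {
 public:
  virtual ~ToyEngine() = default;

  bool Run() {
    if (!built_) {
      built_ = BuildDeviceProgram();  // first call converts and builds
      if (!built_) return false;      // e.g. an op has no RKNPU bridge
    }
    return LaunchDeviceProgram();
  }

 protected:
  virtual bool BuildDeviceProgram() { return true; }
  virtual bool LaunchDeviceProgram() { return true; }

 private:
  bool built_{false};
};

int main() {
  ToyEngine engine;
  std::cout << (engine.Run() ? "ok" : "failed") << "\n";
  return 0;
}
```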
8cc2607e73e605214e08e42e70de457a206e2468..550cf299f676105271e758eb1a13e880045ee1cc 100644 --- a/lite/kernels/x86/activation_compute_test.cc +++ b/lite/kernels/x86/activation_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/activation_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc index 35ce822e010fc3ce2dc756b86e3a437789cc8359..5c672a1ee05116ccefec074f54d0726a7cd010ea 100644 --- a/lite/kernels/x86/attention_padding_mask_compute_test.cc +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/attention_padding_mask_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/attention_padding_mask_compute.cc" namespace paddle { namespace lite { @@ -81,8 +83,7 @@ int get_max_len(const LoD& lod) { TEST(attention_padding_mask_x86, retrive_op) { auto attention_padding_mask = - KernelRegistry::Global().Create( - "attention_padding_mask"); + KernelRegistry::Global().Create("attention_padding_mask"); ASSERT_FALSE(attention_padding_mask.empty()); ASSERT_TRUE(attention_padding_mask.front()); } diff --git a/lite/kernels/x86/batch_norm_compute_test.cc b/lite/kernels/x86/batch_norm_compute_test.cc index 5ec2cdcdda0e9ff3698c80584b36396b38328e03..dd70f78efa7334355c459fd1d85a7da4f5b05b60 100644 --- a/lite/kernels/x86/batch_norm_compute_test.cc +++ b/lite/kernels/x86/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/batch_norm_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(batch_norm_x86, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc index f7aa52ca6d0dde603357f009220b4a3a53f56833..b039cf5d3b01032e60ef7bdcf31a45c8ed302215 100644 --- a/lite/kernels/x86/cast_compute_test.cc +++ b/lite/kernels/x86/cast_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
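The x86 test updates in this patch all collapse the old templated KernelRegistry lookups into the plain Create(op_name) overload; target, precision and layout can still be passed as extra arguments, as the OpenCL transpose test above does. A minimal retrieve-and-check test in the same style, assuming only the headers these tests already include:

```cpp
#include <gtest/gtest.h>

#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

// Look the kernel up by op name only. A filtered lookup such as
// Create("transpose", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault))
// is also available, as used by the OpenCL test earlier in this patch.
TEST(relu_x86, retrieve_op_sketch) {
  auto relu = KernelRegistry::Global().Create("relu");
  ASSERT_FALSE(relu.empty());
  ASSERT_TRUE(relu.front());
}

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
```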
-#include "lite/kernels/x86/cast_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/cast_compute.h" namespace paddle { namespace lite { @@ -25,8 +27,7 @@ namespace kernels { namespace x86 { TEST(cast_x86, retrive_op) { - auto cast = - KernelRegistry::Global().Create("cast"); + auto cast = KernelRegistry::Global().Create("cast"); ASSERT_FALSE(cast.empty()); ASSERT_TRUE(cast.front()); } diff --git a/lite/kernels/x86/concat_compute_test.cc b/lite/kernels/x86/concat_compute_test.cc index 468e9422752561ff6416e8859b485462b9e2abbe..4be51dff6ed613842de431cce8a7960182073c4f 100644 --- a/lite/kernels/x86/concat_compute_test.cc +++ b/lite/kernels/x86/concat_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/concat_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/concat_compute.h" namespace paddle { namespace lite { @@ -23,9 +25,7 @@ namespace kernels { namespace x86 { TEST(concat_x86, retrive_op) { - auto concat = - KernelRegistry::Global().Create( - "concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index 2827c6577e5bf311b4002526d4ac10f636162d96..cd46571a2a9fd6b428f84ca278a453c8675d6ed6 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/conv_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/conv_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(conv_x86, retrive_op) { - auto conv2d = - KernelRegistry::Global().Create( - "conv2d"); + auto conv2d = KernelRegistry::Global().Create("conv2d"); ASSERT_FALSE(conv2d.empty()); ASSERT_TRUE(conv2d.front()); } diff --git a/lite/kernels/x86/dropout_compute_test.cc b/lite/kernels/x86/dropout_compute_test.cc index 279f639f40ece0a10e45fe16f36fcb443cea550a..d30fbbea670d9509e722e3a27fd3dbf1d89a308c 100644 --- a/lite/kernels/x86/dropout_compute_test.cc +++ b/lite/kernels/x86/dropout_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/dropout_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/dropout_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(dropout_x86, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/x86/elementwise_compute_test.cc b/lite/kernels/x86/elementwise_compute_test.cc index 9850c0ce86756cd12e28ab95688b79a1c539189c..6379faacad75f98f73eafbdfc2f8c9deb4d086cb 100644 --- a/lite/kernels/x86/elementwise_compute_test.cc +++ b/lite/kernels/x86/elementwise_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/elementwise_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/elementwise_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(elementwise_add_x86, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index f736248ed3632af92dea2823439e6e7d28ff3e1b..4cb7160097e320798c1b1e2ee94d7fec8aedc6d6 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "lite/fluid/for_range.h" #include "lite/fluid/transform.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" #include "lite/utils/variant.h" namespace paddle { @@ -66,9 +65,8 @@ inline void get_mid_dims(const lite::DDim &x_dims, for (size_t i = 0; i < y_dims.size(); ++i) { if (x_dims[i + axis] != y_dims[i]) { // only support single y_dims[i] = 1 now. - PADDLE_ENFORCE_EQ( - *mid_flag, 0, "Broadcast support y_dims with single 1."); - PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch."); + CHECK_EQ(*mid_flag, 0) << "Broadcast support y_dims with single 1."; + CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch."; // m*n*k m*1*k for (size_t j = 0; j < i; ++j) { (*pre) *= y_dims[j]; @@ -95,8 +93,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, } for (size_t i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch."); + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; (*n) *= y_dims[i]; } @@ -314,17 +311,16 @@ void ElementwiseComputeEx(const lite::Context &ctx, TransformFunctor functor(x, y, z, ctx, func); auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - y_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + CHECK_GE(x_dims.size(), y_dims_untrimed.size()) + << "Rank of first input must >= rank of second input."; if (x_dims == y_dims_untrimed) { functor.Run(); return; } axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < static_cast(x_dims.size()), - "Axis should be in range [0, x_dims)"); + CHECK(axis >= 0 && axis < static_cast(x_dims.size())) + << "Axis should be in range [0, x_dims)"; auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post, mid_flag = 0; @@ -560,9 +556,8 @@ void FusedElemwiseAndActComputeEx(const lite::Context &ctx, lite::Tensor *out, lite::Tensor *intermediate_out) { if (KeepIntermediateOut) { - PADDLE_ENFORCE(intermediate_out, - "The save_intermediate_out is opened, " - "intermediate_out should not be nullptr."); + CHECK(intermediate_out) << "The save_intermediate_out is opened, " + "intermediate_out should not be nullptr."; } const lite::DDim &x_dim = x.dims(); diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc index 16bec18a1c1c4d0075e1ed1dcc4f3a3462917868..e3e8b13413808b447018ac14acf9d4a16c0f47a6 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,8 +29,7 @@ namespace x86 { TEST(fill_constant_batch_size_like_x86, retrive_op) { auto fill_constant_batch_size_like = - KernelRegistry::Global().Create( - "fill_constant_batch_size_like"); + KernelRegistry::Global().Create("fill_constant_batch_size_like"); ASSERT_FALSE(fill_constant_batch_size_like.empty()); ASSERT_TRUE(fill_constant_batch_size_like.front()); } diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc index 286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19..63284452244b19b807f8b101cab5cbabbbf68476 100644 --- a/lite/kernels/x86/gather_compute_test.cc +++ b/lite/kernels/x86/gather_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gather_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gather_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(gather_x86, retrive_op) { - auto gather = - KernelRegistry::Global().Create( - "gather"); + auto gather = KernelRegistry::Global().Create("gather"); ASSERT_FALSE(gather.empty()); int cnt = 0; for (auto item = gather.begin(); item != gather.end(); ++item) { diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc index e930cd32df91196fa9f4559ee6ba22bd8b82d337..9bda9ac4c1c0cee84141095b3100bb82a99661b7 100644 --- a/lite/kernels/x86/gelu_compute_test.cc +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
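The PADDLE_ENFORCE* calls in elementwise_op_function.h are replaced by glog-style CHECK/CHECK_EQ/CHECK_GE macros, which stream their message after the condition instead of taking it as an argument. A small standalone sketch of the broadcast-axis validation written with those macros; the helper name and the plain std::vector dims are illustrative, and the macros are assumed to come from lite/utils/cp_logging.h, which the patched header keeps including.

```cpp
#include <cstdint>
#include <vector>

#include "lite/utils/cp_logging.h"  // CHECK, CHECK_EQ, CHECK_GE

// Validate that y's dims can be broadcast against x's dims starting at `axis`,
// mirroring the checks get_mid_dims()/ElementwiseComputeEx() now perform.
void CheckBroadcastDims(const std::vector<int64_t>& x_dims,
                        const std::vector<int64_t>& y_dims,
                        int axis) {
  CHECK_GE(x_dims.size(), y_dims.size())
      << "Rank of first input must >= rank of second input.";
  CHECK(axis >= 0 && axis < static_cast<int>(x_dims.size()))
      << "Axis should be in range [0, x_dims)";
  for (size_t i = 0; i < y_dims.size(); ++i) {
    CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
  }
}

int main() {
  CheckBroadcastDims({2, 3, 4, 5}, {3, 4}, /*axis=*/1);  // passes
  return 0;
}
```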
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gelu_x86, retrive_op) { - auto gelu = - KernelRegistry::Global().Create("gelu"); + auto gelu = KernelRegistry::Global().Create("gelu"); ASSERT_FALSE(gelu.empty()); ASSERT_TRUE(gelu.front()); } diff --git a/lite/kernels/x86/gru_compute_test.cc b/lite/kernels/x86/gru_compute_test.cc index 3e0e944f23bafda6a5eb742a8e4b023c268c9955..c4a0045b3c1b27dfb1b518aede7dad2872cd1dc2 100644 --- a/lite/kernels/x86/gru_compute_test.cc +++ b/lite/kernels/x86/gru_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gru_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gru_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gru_x86, retrive_op) { - auto gru = - KernelRegistry::Global().Create("gru"); + auto gru = KernelRegistry::Global().Create("gru"); ASSERT_FALSE(gru.empty()); ASSERT_TRUE(gru.front()); } diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index 46d151bbc406e19b498b87420029da7f9c1c2f12..ba75dad11b75441dc09b75224bfc4dfb271396a8 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -63,10 +63,10 @@ class LayerNormCompute : public KernelLite { out.ShareDataWith(*y); out.Resize(matrix_shape); - PADDLE_ENFORCE_EQ(Mean->numel(), left); - PADDLE_ENFORCE_EQ(Var->numel(), left); - PADDLE_ENFORCE_EQ(Scale->numel(), right); - PADDLE_ENFORCE_EQ(Bias->numel(), right); + CHECK_EQ(Mean->numel(), left); + CHECK_EQ(Var->numel(), left); + CHECK_EQ(Scale->numel(), right); + CHECK_EQ(Bias->numel(), right); auto ker = paddle::lite::jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc index d39500a5e8827230ddeecd6bbe30f8c0a47ee929..617f1fae066aa6dc5068d293f8e977a2d37fe496 100644 --- a/lite/kernels/x86/layer_norm_compute_test.cc +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/layer_norm_compute.h" #include + #include #include #include + #include "lite/backends/x86/jit/helper.h" #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernels.h" #include "lite/core/op_registry.h" +#include "lite/kernels/x86/layer_norm_compute.h" namespace paddle { namespace lite { @@ -74,9 +76,7 @@ std::vector ref(lite::Tensor* x, // layer_norm TEST(layer_norm_x86, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/x86/leaky_relu_compute_test.cc b/lite/kernels/x86/leaky_relu_compute_test.cc index 76daf4ff9ffc5dea8b532610abc917406356b3a5..75ebcf071298d072682b6ea535b3c8244c328500 100644 --- a/lite/kernels/x86/leaky_relu_compute_test.cc +++ b/lite/kernels/x86/leaky_relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,9 +26,7 @@ namespace kernels { namespace x86 { TEST(leaky_relu_x86, retrive_op) { - auto leaky_relu = - KernelRegistry::Global().Create( - "leaky_relu"); + auto leaky_relu = KernelRegistry::Global().Create("leaky_relu"); ASSERT_FALSE(leaky_relu.empty()); ASSERT_TRUE(leaky_relu.front()); } diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc index 0c3f3ad50940ab0059ab04fb507a786f735584b9..02ed8e1b4bb3a7bccc8560cb1f51166d3833e6bf 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute_test.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/match_matrix_tensor_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/match_matrix_tensor_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(match_matrix_tensor_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "match_matrix_tensor"); + auto kernel = KernelRegistry::Global().Create("match_matrix_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/matmul_compute_test.cc b/lite/kernels/x86/matmul_compute_test.cc index 53d2d1a47a0cdbdaf5dfa83a79987d908171a36d..1e98702193af11ea8678bdfbc2382c7845c49b38 100644 --- a/lite/kernels/x86/matmul_compute_test.cc +++ b/lite/kernels/x86/matmul_compute_test.cc @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/matmul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/matmul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(matmul_x86, retrive_op) { - auto matmul = - KernelRegistry::Global().Create( - "matmul"); + auto matmul = KernelRegistry::Global().Create("matmul"); ASSERT_FALSE(matmul.empty()); ASSERT_TRUE(matmul.front()); } diff --git a/lite/kernels/x86/mul_compute_test.cc b/lite/kernels/x86/mul_compute_test.cc index 32d82cbb77aeb71dcd1c172ec0c1e343c3954fea..0d66a2dbd6eb27dac6acde47cc395c3c6245b1b5 100644 --- a/lite/kernels/x86/mul_compute_test.cc +++ b/lite/kernels/x86/mul_compute_test.cc @@ -12,21 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/mul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/mul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(mul_x86, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 4ea727cedd5206f5f1ac2685297f72c3019bb313..d67d3a1de2248a1f8c180867c76b5d31affc11b9 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/pool_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/pool_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(pool_x86, retrive_op) { - auto pool2d = - KernelRegistry::Global().Create( - "pool2d"); + auto pool2d = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool2d.empty()); ASSERT_TRUE(pool2d.front()); } diff --git a/lite/kernels/x86/relu_compute_test.cc b/lite/kernels/x86/relu_compute_test.cc index 37ed6db7f919e31828f89462fa46d5263c480fcc..c2233bd04cf33c983db521335d88339592d2ce6b 100644 --- a/lite/kernels/x86/relu_compute_test.cc +++ b/lite/kernels/x86/relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/reshape_compute_test.cc b/lite/kernels/x86/reshape_compute_test.cc index 16fc8f31aded0ef62fdf14aa671a73ccf6635fb7..88f38adee4aa413ac91bfdec0294c816020942b5 100644 --- a/lite/kernels/x86/reshape_compute_test.cc +++ b/lite/kernels/x86/reshape_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/reshape_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/reshape_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,9 +29,7 @@ namespace x86 { // reshape TEST(reshape_x86, retrive_op) { - auto reshape = - KernelRegistry::Global().Create( - "reshape"); + auto reshape = KernelRegistry::Global().Create("reshape"); ASSERT_FALSE(reshape.empty()); ASSERT_TRUE(reshape.front()); } @@ -86,9 +87,7 @@ TEST(reshape_x86, run_test) { // reshape2 TEST(reshape2_x86, retrive_op) { - auto reshape2 = - KernelRegistry::Global().Create( - "reshape2"); + auto reshape2 = KernelRegistry::Global().Create("reshape2"); ASSERT_FALSE(reshape2.empty()); ASSERT_TRUE(reshape2.front()); } diff --git a/lite/kernels/x86/scale_compute_test.cc b/lite/kernels/x86/scale_compute_test.cc index 6da27f444c7ed4c5a86e5f08a6c1612110bb02b9..dafb1e590f27f14208cff1e9aef79b28256cd048 100644 --- a/lite/kernels/x86/scale_compute_test.cc +++ b/lite/kernels/x86/scale_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/scale_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/scale_compute.h" namespace paddle { namespace lite { @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(scale_x86, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc index 425df2a0f0544d7345923cb2efdce96074845311..515a5e30c81e9edd6b9ebb8e52955b5de6ec9e24 100644 --- a/lite/kernels/x86/search_fc_compute_test.cc +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_fc_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_fc_compute.h" namespace paddle { namespace lite { @@ -53,9 +55,7 @@ void fc_cpu_base(const lite::Tensor* X, } TEST(search_fc_x86, retrive_op) { - auto search_fc = - KernelRegistry::Global().Create( - "search_fc"); + auto search_fc = KernelRegistry::Global().Create("search_fc"); ASSERT_FALSE(search_fc.empty()); ASSERT_TRUE(search_fc.front()); } diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc index b85d97e3f1be1f2f02837d347e42ce6731c58414..d120ca7500513bc99b71bf0003ec31bcf1e2ac19 100644 --- a/lite/kernels/x86/search_grnn_compute_test.cc +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_grnn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_grnn_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_grnn_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_grnn"); + auto kernel = KernelRegistry::Global().Create("search_grnn"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc index f4c36c2a63488a6bb902a2b8b4ad81fa32b37672..ae2007e463c0fc97a099cd5ae902b623e361066c 100644 --- a/lite/kernels/x86/search_group_padding_compute_test.cc +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/search_group_padding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_group_padding_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace x86 { TEST(search_group_padding_x86, retrieve_op) { auto search_group_padding = - KernelRegistry::Global().Create( - "search_group_padding"); + KernelRegistry::Global().Create("search_group_padding"); ASSERT_FALSE(search_group_padding.empty()); ASSERT_TRUE(search_group_padding.front()); } diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc index 0d978b35ed040d6b7c44354f37999e6e34e2e3ef..32bf3276bb378beafbf273ffe7142b9b8fc493ac 100644 --- a/lite/kernels/x86/search_seq_depadding_compute_test.cc +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_seq_depadding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_seq_depadding_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_seq_depadding_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_seq_depadding"); + auto kernel = KernelRegistry::Global().Create("search_seq_depadding"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc index 3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec..d80d3c2d1097fe2bbb47eb4c9d1384ae54d7fe8c 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute_test.cc +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_arithmetic_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_arithmetic_compute.h" namespace paddle { namespace lite { @@ -77,8 +79,7 @@ void prepare_input(Tensor* x, const LoD& x_lod) { TEST(sequence_arithmetic_x86, retrive_op) { auto sequence_arithmetic = - KernelRegistry::Global().Create( - "sequence_arithmetic"); + KernelRegistry::Global().Create("sequence_arithmetic"); ASSERT_FALSE(sequence_arithmetic.empty()); ASSERT_TRUE(sequence_arithmetic.front()); } diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d..9899e6c08a1d1af9dea3728b5105ff78286de819 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_concat_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_concat_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -94,9 +97,7 @@ static void sequence_concat_ref(const std::vector& xs, } // namespace TEST(sequence_concat_x86, retrive_op) { - auto sequence_concat = - KernelRegistry::Global().Create( - "sequence_concat"); + auto sequence_concat = KernelRegistry::Global().Create("sequence_concat"); ASSERT_FALSE(sequence_concat.empty()); ASSERT_TRUE(sequence_concat.front()); } diff --git a/lite/kernels/x86/sequence_expand_as_compute_test.cc b/lite/kernels/x86/sequence_expand_as_compute_test.cc index d49fdbb7a6164435abb9eb7189b18376066d55df..6eafb5f1e5275e375b7c61fda3c437b6959b8dd2 100644 --- a/lite/kernels/x86/sequence_expand_as_compute_test.cc +++ b/lite/kernels/x86/sequence_expand_as_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_expand_as_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_expand_as_compute.h" namespace paddle { namespace lite { @@ -27,8 +29,7 @@ namespace x86 { TEST(sequence_expand_as_x86, retrive_op) { auto sequence_expand_as = - KernelRegistry::Global().Create( - "sequence_expand_as"); + KernelRegistry::Global().Create("sequence_expand_as"); ASSERT_FALSE(sequence_expand_as.empty()); ASSERT_TRUE(sequence_expand_as.front()); } diff --git a/lite/kernels/x86/sequence_pool_compute_test.cc b/lite/kernels/x86/sequence_pool_compute_test.cc index 372bfaf8741cdcdc902efb6b8380eb4c34dd49ad..35116adbf6f06b87482cfff99182ee6c675ba7ed 100644 --- a/lite/kernels/x86/sequence_pool_compute_test.cc +++ b/lite/kernels/x86/sequence_pool_compute_test.cc @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_pool_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_pool_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(sequence_pool_x86, retrive_op) { - auto sequence_pool = - KernelRegistry::Global().Create( - "sequence_pool"); + auto sequence_pool = KernelRegistry::Global().Create("sequence_pool"); ASSERT_FALSE(sequence_pool.empty()); ASSERT_TRUE(sequence_pool.front()); } diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc index adf9981b242bfbb7f60989369715354cc2043685..37c2f9571d486a36eccc1f01c06a1550d4609730 100644 --- a/lite/kernels/x86/sequence_reverse_compute_test.cc +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_reverse_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_reverse_compute.h" namespace paddle { namespace lite { @@ -44,9 +46,7 @@ static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { } // namespace TEST(sequence_reverse_x86, retrive_op) { - auto sequence_reverse = - KernelRegistry::Global().Create( - "sequence_reverse"); + auto sequence_reverse = KernelRegistry::Global().Create("sequence_reverse"); ASSERT_FALSE(sequence_reverse.empty()); ASSERT_TRUE(sequence_reverse.front()); } diff --git a/lite/kernels/x86/sgd_compute.cc b/lite/kernels/x86/sgd_compute.cc index a3241468f9f09d66401aa83e0d738779e555dfba..dd056e30209953c1f360d714db50e3236f278510 100644 --- a/lite/kernels/x86/sgd_compute.cc +++ b/lite/kernels/x86/sgd_compute.cc @@ -41,8 +41,8 @@ class SGDCompute : public KernelLite { auto *param_out = &sgd_param.ParamOut->raw_tensor(); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + CHECK_EQ(param->numel(), sz); + CHECK_EQ(grad->numel(), sz); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->template data(); diff --git a/lite/kernels/x86/shape_compute_test.cc b/lite/kernels/x86/shape_compute_test.cc index 88bd98f33ffc7a727de584543bc7392cdbb2883f..9fe5e6c51eaee783072717cea055b00b75c59c07 100644 --- a/lite/kernels/x86/shape_compute_test.cc +++ b/lite/kernels/x86/shape_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/shape_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/shape_compute.h" namespace paddle { namespace lite { @@ -23,8 +25,7 @@ namespace kernels { namespace x86 { TEST(shape_x86, retrive_op) { - auto shape = - KernelRegistry::Global().Create("shape"); + auto shape = KernelRegistry::Global().Create("shape"); ASSERT_FALSE(shape.empty()); ASSERT_TRUE(shape.front()); } diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index ad30215691cde66ab1c7c8c57930fc6d58de7cd5..d32327668bac389e42ff9411be50ce3df42e39ff 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -157,7 +157,7 @@ void slice_compute(const lite::Tensor* in, } } - out->mutable_data(lite::TargetType::kX86); + out->mutable_data(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); diff --git a/lite/kernels/x86/slice_compute_test.cc b/lite/kernels/x86/slice_compute_test.cc index a62a62cd88ce48c4d47d784ecbc2fd16d0f433d1..b978d4533ccb28ae8826b8304d93f9bdbe85d106 100644 --- a/lite/kernels/x86/slice_compute_test.cc +++ b/lite/kernels/x86/slice_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
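slice_compute.h above switches out->mutable_data(lite::TargetType::kX86) to the default-target mutable_data() and then slices the Eigen tensor view with offset/extent arrays. As a reference for that slicing step only, here is a self-contained Eigen sketch of the offsets/extents pattern, independent of the Lite tensor types; the shapes and values are made up.

```cpp
#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A 4x5 input tensor filled with 0..19.
  Eigen::Tensor<float, 2> in(4, 5);
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 5; ++j) in(i, j) = static_cast<float>(i * 5 + j);

  // Keep rows [1, 3) and columns [2, 5): offsets are the start indices,
  // extents are how many elements are kept along each axis.
  Eigen::array<Eigen::Index, 2> offsets;
  offsets[0] = 1;
  offsets[1] = 2;
  Eigen::array<Eigen::Index, 2> extents;
  extents[0] = 2;
  extents[1] = 3;
  Eigen::Tensor<float, 2> out = in.slice(offsets, extents);

  std::cout << out(0, 0) << " " << out(1, 2) << "\n";  // prints 7 and 14
  return 0;
}
```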
-#include "lite/kernels/x86/slice_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/slice_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -79,8 +82,7 @@ static void slice_ref(const float* input, } TEST(slice_x86, retrive_op) { - auto slice = - KernelRegistry::Global().Create("slice"); + auto slice = KernelRegistry::Global().Create("slice"); ASSERT_FALSE(slice.empty()); ASSERT_TRUE(slice.front()); } diff --git a/lite/kernels/x86/softmax_compute_test.cc b/lite/kernels/x86/softmax_compute_test.cc index 0debeecb3150dfdd2626b6f8f3f6b5ef63981d93..f3def92992c7ca01e75d12b86b2680768a9fd2ee 100644 --- a/lite/kernels/x86/softmax_compute_test.cc +++ b/lite/kernels/x86/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/softmax_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(softmax_x86, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc index d105165a98f936b7a6973e57f5199977a0b8bed3..33942fca96508d2868520e5b5e242b83a1f38b0e 100644 --- a/lite/kernels/x86/stack_compute_test.cc +++ b/lite/kernels/x86/stack_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/stack_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/stack_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,8 +28,7 @@ namespace x86 { // stack TEST(stack_x86, retrive_op) { - auto stack = - KernelRegistry::Global().Create("stack"); + auto stack = KernelRegistry::Global().Create("stack"); ASSERT_FALSE(stack.empty()); ASSERT_TRUE(stack.front()); } diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc index 8132505fad6d93997c73ffb735a4a798c15d87a6..6cba531fd34df029a1cdaaf9d6925e379796260d 100644 --- a/lite/kernels/x86/tanh_compute_test.cc +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(tanh_x86, retrive_op) { - auto tanh = - KernelRegistry::Global().Create("tanh"); + auto tanh = KernelRegistry::Global().Create("tanh"); ASSERT_FALSE(tanh.empty()); ASSERT_TRUE(tanh.front()); } diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h index 5f6faed2017b6bdef60e7505bf1f0088d86b3ec1..87e7fee7deec711914bd43039301f7180a4bcaa0 100644 --- a/lite/kernels/x86/transpose_compute.h +++ b/lite/kernels/x86/transpose_compute.h @@ -60,7 +60,7 @@ inline void TransCompute(const int dim, trans6(context, in, out, axis); break; default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); + LOG(FATAL) << "Tensors with rank at most 6 are supported"; } } diff --git a/lite/kernels/x86/transpose_compute_test.cc b/lite/kernels/x86/transpose_compute_test.cc index d8533d98258637eba516974e03cd4d88fd452293..aa99db36c450326765d602aaf0b48f72a1a63e13 100644 --- a/lite/kernels/x86/transpose_compute_test.cc +++ b/lite/kernels/x86/transpose_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/transpose_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/transpose_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,9 +28,7 @@ namespace x86 { // transpose TEST(transpose_x86, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -75,9 +76,7 @@ TEST(transpose_x86, run_test) { // transpose2 TEST(transpose2_x86, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc index edef8cb2df75dfb45ad4964975365d4ddbbe9086..a6787b2e3e84360a63618f130305446316a08e01 100644 --- a/lite/kernels/x86/var_conv_2d_compute_test.cc +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
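transpose_compute.h now reports an unsupported tensor rank with LOG(FATAL) instead of PADDLE_THROW. A stripped-down sketch of that dispatch shape follows; the real kernel calls Eigen-based trans1..trans6 helpers, which are replaced here by a stand-in callback so the example stays self-contained, and LOG is assumed to come from lite/utils/cp_logging.h as in the rest of the tree.

```cpp
#include <functional>

#include "lite/utils/cp_logging.h"  // LOG(FATAL)

// Dispatch on tensor rank the way TransCompute() does, aborting with a fatal
// log message for ranks the kernel does not implement.
void DispatchByRank(int rank, const std::function<void(int)>& do_transpose) {
  switch (rank) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 5:
    case 6:
      do_transpose(rank);
      break;
    default:
      LOG(FATAL) << "Tensors with rank at most 6 are supported";
  }
}

int main() {
  DispatchByRank(3, [](int) { /* run the rank-3 transpose here */ });
  return 0;
}
```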
-#include "lite/kernels/x86/var_conv_2d_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/x86/var_conv_2d_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -197,9 +200,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, } TEST(var_conv_2d_x86, retrive_op) { - auto var_conv_2d = - KernelRegistry::Global().Create( - "var_conv_2d"); + auto var_conv_2d = KernelRegistry::Global().Create("var_conv_2d"); ASSERT_FALSE(var_conv_2d.empty()); ASSERT_TRUE(var_conv_2d.front()); } diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 7ded008387b7d7c92fb2ce6b18e73e1c1e51f29d..fdb485df02f366f7f4868965b1f20c6861b03d43 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -6,6 +6,7 @@ if(LITE_WITH_XTCL) add_subdirectory(bridges) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) else() + # basic add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu) add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) @@ -15,15 +16,32 @@ else() add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps}) + + # extra + add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(layer_norm_compute_xpu XPU extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_grnn_compute_xpu XPU extra SRCS 
search_grnn_compute.cc DEPS ${lite_kernel_deps}) + + # extra(fused kernel) add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__resnet_cbam_compute_xpu XPU extra SRCS __xpu__resnet_cbam_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__embedding_with_eltwise_add_compute_xpu XPU extra SRCS __xpu__embedding_with_eltwise_add_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps}) endif() diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc index 376cdd0dc23426ede42ddac60e061727f73322e3..224bfdc130338bc653091400708bc8a7421a9482 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc @@ -31,11 +31,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() { CHECK_EQ(table_dims.size(), 2); /* shape like [table_len, embed_dim] */ table_lens_cpu_.push_back(table_dims[0]); } - void* lens_ptr = nullptr; + size_t lens_size = table_lens_cpu_.size() * sizeof(int); - xpu_malloc(&lens_ptr, lens_size); - xpu_memcpy(lens_ptr, &table_lens_cpu_[0], lens_size, XPU_HOST_TO_DEVICE); - table_lens_guard_.reset(lens_ptr); + table_lens_guard_ = + TargetWrapperXPU::MallocScratchPad(lens_size, false /* use_l3 */); + XPU_CALL(xpu_memcpy(table_lens_guard_->addr_, + &table_lens_cpu_[0], + lens_size, + XPU_HOST_TO_DEVICE)); } void XPUEmbeddingWithEltwiseAddCompute::Run() { @@ -55,16 +58,16 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() { int embed_dim = table_dims[1]; int emb_layer_num = param.Ids.size(); int r = xdnn::embedding_with_ewadd( - ctx.GetRawContext(), /* context */ - embed_dim, /* embed_dim */ - idx_len, /* idx_len */ - emb_layer_num, /* emb_layer_num */ - param.padding_idx, /* padding_idx */ - &arg_tables_[0], /* tables */ - &arg_ids_[0], /* indices */ - static_cast(table_lens_guard_.get()), /* table_lens */ - nullptr, /* scale_after_emb */ - nullptr, /* scale_after_ewadd */ + ctx.GetRawContext(), /* context */ + embed_dim, /* embed_dim */ + idx_len, /* idx_len */ + emb_layer_num, /* emb_layer_num */ + param.padding_idx, /* padding_idx */ + &arg_tables_[0], /* tables */ + &arg_ids_[0], /* indices */ + static_cast(table_lens_guard_->addr_), /* table_lens */ + nullptr, /* scale_after_emb */ + nullptr, /* scale_after_ewadd */ param.Out->mutable_data(TARGET(kXPU)) /* top */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h index 10ba6e0b5b76a1dbebfd633732f7c36e6ac7c954..124ed7866f0a52b892e30ae41398d5140064c964 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -36,7 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute 
private: std::vector arg_ids_; std::vector arg_tables_; - std::unique_ptr table_lens_guard_; + XPUScratchPadGuard table_lens_guard_; std::vector table_lens_cpu_; }; diff --git a/lite/kernels/xpu/__xpu__mmdnn_compute.cc b/lite/kernels/xpu/__xpu__mmdnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..09d59fcee37c634a87636ac80e7be15d927f2509 --- /dev/null +++ b/lite/kernels/xpu/__xpu__mmdnn_compute.cc @@ -0,0 +1,1514 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace { + +void FillMax(float max, float* xpu_ptr) { + float maxs[4] = {max, 0.0f, 0.0f, 0.0f}; + XPU_CALL(xpu_memcpy( + xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void GrnnLayout(int batch, + const std::vector& offset, + std::vector* new_offset_ptr, + std::vector* idx_sorted_ptr) { + auto& new_offset = *new_offset_ptr; + auto& idx_sorted = *idx_sorted_ptr; + + std::vector width; + width.resize(batch); + new_offset.clear(); + idx_sorted.clear(); + + idx_sorted.resize(batch); + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted[i] = i; + } + std::sort(idx_sorted.data(), + idx_sorted.data() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted[0]]; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted[k]] > last_width) { + sub_row = width[idx_sorted[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted[k]]; + j = k - 1; + break; + } + } + } +} + +} // anonymous namespace + +class MMDNNIdInfo { + XPUScratchPadGuard l3_buffer_guard_; + char* l3_buffer_{nullptr}; + std::unique_ptr cpu_buffer_guard_; + char* cpu_buffer_{nullptr}; + + public: + const int64_t* id0_64{nullptr}; + const int64_t* id1_64{nullptr}; + int64_t* lod_64{nullptr}; + int* lod_32{nullptr}; + int* new_offset_32{nullptr}; + int* idx_sorted_32{nullptr}; + + std::vector lod; + std::vector new_offset; + std::vector idx_sorted; + int batch; + int seqlen_max; + int seqlen_sum; + int seqlen_square_sum; + + void Init(int upper_bound_batch, int upper_bound_seqlen) { + int ub_lod_64_size = (upper_bound_batch + 1) * sizeof(int64_t); + int ub_lod_32_size = (upper_bound_batch + 1) * sizeof(int); + int ub_new_offset_32_size = (upper_bound_seqlen + 1) * sizeof(int); + int ub_idx_sorted_32_size = (upper_bound_batch + 1) * sizeof(int); + int total_size = ub_lod_64_size + ub_lod_32_size + ub_new_offset_32_size + + ub_idx_sorted_32_size; + + // 
TODO(miaotianxiang): use l3? + l3_buffer_guard_ = TargetWrapperXPU::MallocScratchPad(total_size, false); + l3_buffer_ = reinterpret_cast(l3_buffer_guard_->addr_); + cpu_buffer_guard_.reset(new char[total_size]); + cpu_buffer_ = cpu_buffer_guard_.get(); + } + + void Update(lite::Tensor* id0, lite::Tensor* id1) { + auto& id0_lod = id0->lod()[0]; + lod.clear(); + for (auto e : id0_lod) { + lod.push_back(e); + } + + seqlen_max = 0; + seqlen_sum = 0; + seqlen_square_sum = 0; + batch = lod.size() - 1; + for (int i = 0; i < batch; i++) { + int seqlen = lod[i + 1] - lod[i]; + seqlen_max = std::max(seqlen_max, seqlen); + seqlen_sum = seqlen_sum + seqlen; + seqlen_square_sum = seqlen_square_sum + seqlen * seqlen; + } + GrnnLayout(batch, lod, &new_offset, &idx_sorted); + + id0_64 = id0->data(); + id1_64 = id1->data(); + + int offset = 0; + lod_64 = reinterpret_cast(l3_buffer_ + offset); + memcpy( + cpu_buffer_ + offset, id0_lod.data(), id0_lod.size() * sizeof(int64_t)); + offset += id0_lod.size() * sizeof(int64_t); + lod_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, lod.data(), lod.size() * sizeof(int)); + offset += lod.size() * sizeof(int); + new_offset_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + new_offset.data(), + new_offset.size() * sizeof(int)); + offset += new_offset.size() * sizeof(int); + idx_sorted_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + idx_sorted.data(), + idx_sorted.size() * sizeof(int)); + offset += idx_sorted.size() * sizeof(int); + XPU_CALL(xpu_memcpy( + l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + } +}; + +class MMDNNFcOp { + const int16_t* weight_{nullptr}; + XPUScratchPadGuard weight_max_guard_; + float* weight_max_{nullptr}; + const float* bias_{nullptr}; + XPUScratchPadGuard in_max_guard_; + float* in_max_{nullptr}; + int n_; + int k_; + xdnn::Activation_t::act_enum act_type_; + XPUScratchPadGuard out_max_guard_; + + public: + float* out_max{nullptr}; + + void Init(const int16_t* weight, + float weight_max, + const float* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + n_ = n; + k_ = k; + act_type_ = act_type; + + weight_ = weight; + weight_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + weight_max_ = reinterpret_cast(weight_max_guard_->addr_); + FillMax(weight_max, weight_max_); + + bias_ = bias; + + in_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + out_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + in_max_ = reinterpret_cast(in_max_guard_->addr_); + out_max = reinterpret_cast(in_max_guard_->addr_); + } + + void Init(lite::Tensor* weight, + float weight_max, + lite::Tensor* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + Init(weight->data(), + weight_max, + bias ? 
bias->data() : nullptr, + n, + k, + act_type); + } + + void Infer(xdnn::Context* ctx, + const float* in, + int m, + float* out, + const float* in_max_by_caller = nullptr) { + int r = 0; + if (in_max_by_caller == nullptr) { + r = xdnn::findmax(ctx, in, m * k_, in_max_); + CHECK_EQ(r, 0); + in_max_by_caller = in_max_; + } + r = xdnn::gemm_int16_maxptr(ctx, + false, + true, + m, + n_, + k_, + 1.0f, + in, + k_, + weight_, + k_, + 0.0f, + out, + n_, + bias_, + act_type_, + in_max_by_caller, + weight_max_, + out_max); + CHECK_EQ(r, 0); + } +}; + +class MMDNNGrnnOp { + MMDNNFcOp fc_e2h0_; + MMDNNFcOp fc_e2h1_; + MMDNNFcOp fc_e2h2_; + const int16_t* dense_h2h_{nullptr}; + float dense_h2h_max_[3]; + XPUScratchPadGuard input_max_guard_; + float* input_max_{nullptr}; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * max(cap_e_, cap_h_) * 5 + // seq2batch_out: [cap_l, cap_e_] + // fc_e2h_out: [3, cap_l, cap_h_] + // gru_out: [cap_l, cap_h_] + int cap_e_; + int cap_h_; + int max_cap_l_; + + public: + void Init(lite::Tensor* wh, + const std::vector& wh_maxs, + lite::Tensor* wi, + const std::vector& wi_maxs, + int cap_e, + int cap_h, + int max_cap_l) { + cap_e_ = cap_e; + cap_h_ = cap_h; + max_cap_l_ = max_cap_l; + + // weight + auto* dense_e2h = wi->data(); + fc_e2h0_.Init(dense_e2h, + wi_maxs[0], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h1_.Init(dense_e2h + cap_e_ * cap_h_, + wi_maxs[1], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h2_.Init(dense_e2h + cap_e_ * cap_h_ * 2, + wi_maxs[2], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + + dense_h2h_ = wh->data(); + dense_h2h_max_[0] = wh_maxs[0]; + dense_h2h_max_[1] = wh_maxs[1]; + dense_h2h_max_[2] = wh_maxs[2]; + + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + input_max_ = reinterpret_cast(input_max_guard_->addr_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * std::max(cap_e_, cap_h_) * max_cap_l_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* in, + float* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + + int slot_size = cap_l * std::max(cap_e_, cap_h_); + float* seq2batch_out = hbm_buffer_; + float* fc_e2h_out = hbm_buffer_ + 1 * slot_size; + float* gru_out = hbm_buffer_ + 4 * slot_size; + if (l3_size > 0 && l3_size >= 5 * slot_size * sizeof(float)) { + seq2batch_out = l3_buffer; + fc_e2h_out = l3_buffer + 1 * slot_size; + gru_out = l3_buffer + 4 * slot_size; + } + + int r = 0; + r = xdnn::search_seq2batch(ctx, + batch, + max_width, + cap_e_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + in, + seq2batch_out); + CHECK_EQ(r, 0); + + r = xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + CHECK_EQ(r, 0); + fc_e2h0_.Infer(ctx, seq2batch_out, cap_l, fc_e2h_out, input_max_); + fc_e2h1_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_, input_max_); + fc_e2h2_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_ * 2, input_max_); + r = xdnn::search_grnn(ctx, + cap_l, + cap_h_, + cap_e_, + max_width, + sentense.new_offset_32, + fc_e2h_out, + dense_h2h_, + gru_out, + dense_h2h_max_[0], + dense_h2h_max_[1], + dense_h2h_max_[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx, + batch, + max_width, + 
cap_h_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + gru_out, + out); + CHECK_EQ(r, 0); + } +}; + +class MMDNNAttentionOp { + int dim_; + float alpha0_; + float alpha1_; + MMDNNFcOp seqfc_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * dim_ + seqlen_square_sum + // seqfc_out: [cap_l, dim_] + // batchgemm0_out: [seqlen_square_sum] + // seq_softmax_out: [seqlen_square_sum], reuse of batchgemm0_out + // batchgemm1_out: [cap_l, dim_], reuse of seqfc_out + + public: + void Init(lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int dim, + int upper_bound_batch, + int upper_bound_seqlen) { + dim_ = dim; + alpha0_ = 0.0883883461356163f; // TODO(miaotianxiang): + alpha1_ = 1.0f; + + seqfc_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + dim_, + dim_, + xdnn::Activation_t::LINEAR); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * (upper_bound_seqlen * dim_ + + upper_bound_seqlen * upper_bound_seqlen)) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* input, + float* pool_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + int* lod_32 = sentense.lod_32; + + float* seqfc_out = hbm_buffer_; + float* batchgemm0_out = hbm_buffer_ + cap_l * dim_; + float* seq_softmax_out = batchgemm0_out; + float* batchgemm1_out = seqfc_out; + if (l3_size > 0 && + l3_size >= + (cap_l * dim_ + sentense.seqlen_square_sum) * sizeof(float)) { + seqfc_out = l3_buffer; + batchgemm0_out = l3_buffer + cap_l * dim_; + seq_softmax_out = batchgemm0_out; + batchgemm1_out = seqfc_out; + } + + seqfc_.Infer(ctx, input, cap_l, seqfc_out); + int r = 0; + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 1, + batch, + lod_32, + max_width, + dim_, + alpha0_, + input, + seqfc_out, + batchgemm0_out); + CHECK_EQ(r, 0); + r = xdnn::search_seq_softmax( + ctx, batchgemm0_out, seq_softmax_out, lod_32, batch, max_width); + CHECK_EQ(r, 0); + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 0, + batch, + lod_32, + max_width, + dim_, + alpha1_, + seq_softmax_out, + input, + batchgemm1_out); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::MAX_WITHOUT_INDEX, + batch, + lod_32, + dim_, + batchgemm1_out, + nullptr, + pool_out); + CHECK_EQ(r, 0); + } +}; + +class MMDNNMatchConvTopk { + std::vector topks_; + int dim_t_; + int dim_in_; + int out_channel_; + + MMDNNFcOp xw_fc_; + const int16_t* conv_weight_{nullptr}; + float conv_weight_max_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // xw_out: [sum(left_len), dim_t_ * dim_in_] + // xwy_out: [sum(left_len * right_len) * dim_t_] + // conv_out: [sum(left_len * right_len) * out_channel_] + // seq_concat_out: [sum(left_len * right_len) * (dim_t_ + out_channel_)] + + XPUScratchPadGuard left_lod_32_guard_; + int* left_lod_32_{nullptr}; + XPUScratchPadGuard right_lod_32_guard_; + int* right_lod_32_{nullptr}; + XPUScratchPadGuard match_lod_32_guard_; + int* match_lod_32_{nullptr}; + XPUScratchPadGuard conv_lod_32_guard_; + int* conv_lod_32_{nullptr}; + XPUScratchPadGuard topk_offset_32_guard_; + int* topk_offset_32_{nullptr}; + XPUScratchPadGuard topks_xpu_guard_; + int* topks_xpu_{nullptr}; + XPUScratchPadGuard useless_topk_pos_guard_; + int* useless_topk_pos_{nullptr}; + + public: + float* 
seq_avg_topk_out{nullptr}; + + void Init(lite::Tensor* input_w, + float input_w_max, + lite::Tensor* conv_w, + float conv_w_max, + int dim_t, + int dim_in, + int out_channel, + int upper_bound_batch, + int upper_bound_seqlen, + const std::vector& topks) { + dim_t_ = dim_t; + dim_in_ = dim_in; + out_channel_ = out_channel; + topks_ = topks; + + xw_fc_.Init(input_w, + input_w_max, + nullptr, + dim_t_ * dim_in_, + dim_in_, + xdnn::Activation_t::LINEAR); + conv_weight_ = conv_w->data(); + conv_weight_max_ = conv_w_max; + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * upper_bound_seqlen * dim_t_ * dim_in_ + + upper_bound_batch * upper_bound_seqlen * upper_bound_seqlen * + (dim_t_ + out_channel_) * 2) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + + left_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + left_lod_32_ = reinterpret_cast(left_lod_32_guard_->addr_); + right_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + right_lod_32_ = reinterpret_cast(right_lod_32_guard_->addr_); + match_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + match_lod_32_ = reinterpret_cast(match_lod_32_guard_->addr_); + conv_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + conv_lod_32_ = reinterpret_cast(conv_lod_32_guard_->addr_); + topk_offset_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + topk_offset_32_ = reinterpret_cast(topk_offset_32_guard_->addr_); + topks_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(topks_.size() * sizeof(int), false); + topks_xpu_ = reinterpret_cast(topks_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(topks_xpu_, + topks_.data(), + topks_.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + useless_topk_pos_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(int), false); + useless_topk_pos_ = reinterpret_cast(useless_topk_pos_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + lite::Tensor* left, + lite::Tensor* right, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + auto left_lod = left->lod()[0]; + auto right_lod = right->lod()[0]; + int batch = left_lod.size() - 1; + + std::vector left_lod_32_cpu; + for (auto e : left_lod) { + left_lod_32_cpu.push_back(e); + } + XPU_CALL(xpu_memcpy(left_lod_32_, + left_lod_32_cpu.data(), + left_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + std::vector right_lod_32_cpu; + for (auto e : right_lod) { + right_lod_32_cpu.push_back(e); + } + XPU_CALL(xpu_memcpy(right_lod_32_, + right_lod_32_cpu.data(), + right_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + std::vector lod_match = {0}; + std::vector lod_conv = {0}; + std::vector lod_topk = {0}; + int x_mul_y_sum = 0; + int left_seqlen_sum = 0; + int left_seqlen_max = 0; + int right_seqlen_sum = 0; + int right_seqlen_max = 0; + for (int i = 0; i < batch; i++) { + int len_x = left_lod[i + 1] - left_lod[i]; + int len_y = right_lod[i + 1] - right_lod[i]; + int imgsize = len_x * len_y; + x_mul_y_sum = x_mul_y_sum + imgsize; + lod_match.push_back(lod_match.back() + imgsize * dim_t_); + lod_conv.push_back(lod_conv.back() + imgsize * out_channel_); + lod_topk.push_back(lod_topk.back() + imgsize * (dim_t_ + out_channel_)); + + left_seqlen_max = std::max(left_seqlen_max, len_x); + 
right_seqlen_max = std::max(right_seqlen_max, len_y); + left_seqlen_sum += len_x; + right_seqlen_sum += len_y; + } + XPU_CALL(xpu_memcpy(match_lod_32_, + lod_match.data(), + lod_match.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(conv_lod_32_, + lod_conv.data(), + lod_conv.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topk_offset_32_, + lod_topk.data(), + lod_topk.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + float* xwy_out = hbm_buffer_; + float* conv_out = hbm_buffer_ + x_mul_y_sum * dim_t_; + float* seq_concat_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_); + float* xw_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + int total_len = x_mul_y_sum * (dim_t_ + out_channel_) * 2 + + left_seqlen_sum * dim_t_ * dim_in_; + if (l3_size > 0 && l3_size >= total_len * sizeof(float)) { + xwy_out = l3_buffer; + conv_out = l3_buffer + x_mul_y_sum * dim_t_; + seq_concat_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_); + xw_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + } + seq_avg_topk_out = out->mutable_data(TARGET(kXPU)); + + int max_width = std::max(left_seqlen_max, right_seqlen_max); + xw_fc_.Infer(ctx, left->data(), left_seqlen_sum, xw_out); + int r = 0; + r = xdnn::match_matrix_tensor(ctx, + batch, + xw_out, + right->data(), + left_lod_32_, + right_lod_32_, + dim_t_, + dim_in_, + xwy_out, + xw_fc_.out_max, + xdnn::Activation_t::RELU, + max_width); + CHECK_EQ(r, 0); + r = xdnn::search_varconv( + ctx, + batch, + dim_t_, + out_channel_, + 5, + 5, + 1, + 1, + xwy_out, + conv_weight_, + right_lod_32_, + left_lod_32_, + conv_out, + conv_weight_max_, + xdnn::Activation_t::RELU); // TODO(miaotianxiang): + CHECK_EQ(r, 0); + r = xdnn::sequence_concat(ctx, + xwy_out, + match_lod_32_, + conv_out, + conv_lod_32_, + seq_concat_out, + batch); + CHECK_EQ(r, 0); + r = xdnn::sequence_topk_avg_pooling(ctx, + seq_concat_out, + seq_avg_topk_out, + useless_topk_pos_, + batch, + dim_t_ + out_channel_, + topk_offset_32_, + left_lod_32_, + right_lod_32_, + topks_xpu_, + topks_.size()); + CHECK_EQ(r, 0); + } +}; + +class MMDNNBidEmbGrnnAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + int cap_h_; + MMDNNGrnnOp bi_fw_; + MMDNNGrnnOp bi_rv_; + MMDNNAttentionOp att_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require at least: 4 * cap_l * emb_dim_ + // emb_rv: [cap_l, emb_dim_] + // grnn_fw: [cap_l, emb_dim_] + // grnn_rv: [cap_l, emb_dim_] + // grnn_rv_rv: [cap_l, emb_dim_] + // concat_2in: [cap_l, 2 * emb_dim_] + // L3.bi_fw: 5 * cap_l * emb_dim_ + // L3.bi_rv: 5 * cap_l * emb_dim_ + // L3.att: cap_l * 2 * emb_dim_ + seqlen_square_sum + + // execution-plan: + // 1. bid_emb_ew, alloc(emb_rv) + // 2. bi_rv, alloc(grnn_rv) + // 3. free(emb_rv) + // 4. sequence_reverse, alloc(grnn_rv_rv) + // 5. sequence_pooling(grnn_rv) + // 6. free(grnn_rv) + // 7. bi_fw alloc(grnn_fw) + // 8. sequence_pooling(grnn_fw) + // 9. concat_2 alloc(concat_2in) + // 10. concat_3 + // 11. 
att + + // alloc-plan: + // [0]: emb_rv, grnn_rv_rv + // [1]: grnn_rv, grnn_fw + // [2, 3]: concat_2in + // [2, 3, 4, 5, 6]: L3.bi_fw, L3.bi_rv + // [4, 5, ..., ?]: L3.att + + public: + float* emb_fw{nullptr}; + float* concat_3in{nullptr}; + float* pool_fw{nullptr}; + float* pool_rv{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* fw_wh, + const std::vector& fw_wh_maxs, + lite::Tensor* fw_wi, + const std::vector& fw_wi_maxs, + lite::Tensor* rv_wh, + const std::vector& rv_wh_maxs, + lite::Tensor* rv_wi, + const std::vector& rv_wi_maxs, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + cap_h_ = emb_dim_; + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + + bi_fw_.Init( + fw_wh, fw_wh_maxs, fw_wi, fw_wi_maxs, emb_dim_, cap_h_, max_cap_l); + bi_rv_.Init( + rv_wh, rv_wh_maxs, rv_wi, rv_wi_maxs, emb_dim_, cap_h_, max_cap_l); + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + 2 * cap_h_, + upper_bound_batch, + upper_bound_seqlen); + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * max_cap_l * cap_h_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* grnn_fw_pool_out, + lite::Tensor* grnn_rv_pool_out, + lite::Tensor* att_pool_out, + lite::Tensor* concat_3in1_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int cap_l = sentense.seqlen_sum; + int slot_len = cap_l * cap_h_; + + float* emb_rv = hbm_buffer_; + float* grnn_fw = hbm_buffer_ + slot_len; + float* grnn_rv = hbm_buffer_ + slot_len; + float* grnn_rv_rv = hbm_buffer_; + float* concat_2in = hbm_buffer_ + 2 * slot_len; + if (l3_size > 0 && l3_size >= 4 * slot_len * sizeof(float)) { + emb_rv = l3_buffer; + grnn_fw = l3_buffer + slot_len; + grnn_rv = l3_buffer + slot_len; + grnn_rv_rv = l3_buffer; + } + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + concat_3in = concat_3in1_out->mutable_data(TARGET(kXPU)); + pool_fw = grnn_fw_pool_out->mutable_data(TARGET(kXPU)); + pool_rv = grnn_rv_pool_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int r = 0; + r = xdnn::search_bid_emb_ew(ctx, + batch, + sentense.lod_64, + sentense.id0_64, + sentense.id1_64, + table_, + table_len_, + emb_dim_, + emb_fw, + emb_rv, + table_len_ - 2, + 1); + CHECK_EQ(r, 0); + bi_rv_.Infer(ctx, + sentense, + emb_rv, + grnn_rv, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + r = xdnn::sequence_reverse( + ctx, batch, sentense.lod_32, cap_h_, grnn_rv, grnn_rv_rv); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); + + bi_fw_.Infer(ctx, + sentense, + emb_fw, + grnn_fw, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); + const int concat_widths[] = {cap_h_, cap_h_, cap_h_}; + const float* concat_ptrs[] = {emb_fw, grnn_fw, grnn_rv_rv}; + r = xdnn::concat( + ctx, cap_l, concat_widths + 1, 2, concat_ptrs + 1, concat_2in); + CHECK_EQ(r, 0); + r = xdnn::concat( + ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + 
CHECK_EQ(r, 0); + att_.Infer(ctx, + sentense, + concat_2in, + att_out, + l3_buffer + 4 * slot_len, + l3_size - 4 * slot_len * sizeof(float)); + } +}; + +class MMDNNEmbAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + MMDNNAttentionOp att_; + + public: + float* emb_fw{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + emb_dim_, + upper_bound_batch, + upper_bound_seqlen); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* att_pool_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int cap_l = sentense.lod.back(); + const float* emb_tables[] = {table_, table_}; + const int64_t* emb_indices[] = {sentense.id0_64, sentense.id1_64}; + int r = + xdnn::embedding_with_ewadd(ctx, + emb_dim_, + cap_l, + 2, + table_len_ - 2, + emb_tables, + emb_indices, + nullptr, + nullptr, + emb_fw); + CHECK_EQ(r, 0); + att_.Infer(ctx, sentense, emb_fw, att_out, l3_buffer, l3_size); + } +}; + +class MMDNNMergeAll { + MMDNNGrnnOp coverage_fw_; + MMDNNGrnnOp coverage_rv_; + int cap_e_; + int cap_h_; + + // TODO(miaotianxiang): + const int fc0_k_ = 1152; + const int fc0_n_ = 512; + const int fc1_k_ = 640; + const int fc1_n_ = 320; + const int fc2_k_ = 320; + const int fc2_n_ = 1; + MMDNNFcOp fc0_; + MMDNNFcOp fc1_; + MMDNNFcOp fc2_; + + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // topk_concat_out_fw: [cap_l, cap_e_] <= [cap_l, cap_h_] + // topk_concat_out_rv: [cap_l, cap_e_] <= [cap_l, cap_h_] + // grnn_fw: [cap_l, cap_h_] + // grnn_rv: [cap_l, cap_h_] + // pool_fw: [batch, cap_h_] + // pool_rv: [batch, cap_h_] + // fc0_in: [batch, fc0_k_] + // fc0_out: [batch, fc0_n_] + // fc1_in: [batch, fc1_k_] + // fc1_out: [batch, fc1_n_] + // fc2_out: [batch, fc2_n_] + + public: + void Init(lite::Tensor* grnn_fw_wh, + std::vector grnn_fw_wh_maxs, + lite::Tensor* grnn_fw_wi, + std::vector grnn_fw_wi_maxs, + lite::Tensor* grnn_rv_wh, + std::vector grnn_rv_wh_maxs, + lite::Tensor* grnn_rv_wi, + std::vector grnn_rv_wi_maxs, + lite::Tensor* fc0_w, + float fc0_w_max, + lite::Tensor* fc0_b, + lite::Tensor* fc1_w, + float fc1_w_max, + lite::Tensor* fc1_b, + lite::Tensor* fc2_w, + float fc2_w_max, + lite::Tensor* fc2_b, + int upper_bound_batch, + int upper_bound_seqlen) { + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + cap_e_ = grnn_fw_wi->dims()[2]; + cap_h_ = grnn_fw_wi->dims()[1]; + + coverage_fw_.Init(grnn_fw_wh, + grnn_fw_wh_maxs, + grnn_fw_wi, + grnn_fw_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + coverage_rv_.Init(grnn_rv_wh, + grnn_rv_wh_maxs, + grnn_rv_wi, + grnn_rv_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + + fc0_.Init( + fc0_w, fc0_w_max, fc0_b, fc0_n_, fc0_k_, xdnn::Activation_t::RELU); + fc1_.Init( + fc1_w, fc1_w_max, fc1_b, fc1_n_, fc1_k_, xdnn::Activation_t::RELU); + fc2_.Init( + fc2_w, fc2_w_max, fc2_b, fc2_n_, fc2_k_, xdnn::Activation_t::LINEAR); + + int hbm_total_len = max_cap_l * cap_e_ * 2 + max_cap_l * cap_h_ * 2 + + upper_bound_batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + + fc1_k_ + fc1_n_ + fc2_n_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + hbm_total_len * 
sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const std::vector concat_topk_x, + const std::vector concat_7in1_x, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + + float* topk_concat_out_fw = hbm_buffer_; + int hbm_total_len = + cap_l * cap_e_ * 2 + cap_l * cap_h_ * 2 + + batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); + if (l3_size > 0 && l3_size >= hbm_total_len * sizeof(float)) { + topk_concat_out_fw = l3_buffer; + } + float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_e_; + float* grnn_fw = topk_concat_out_rv + cap_l * cap_e_; + float* grnn_rv = grnn_fw + cap_l * cap_h_; + float* pool_fw = grnn_rv + cap_l * cap_h_; + float* pool_rv = pool_fw + batch * cap_h_; + float* fc0_in = pool_fw + batch * cap_h_ * 2; + float* fc0_out = fc0_in + batch * fc0_k_; + float* fc1_in = fc0_out + batch * fc0_n_; + float* fc1_out = fc1_in + batch * fc1_k_; + // float* fc2_out = fc1_out + batch * fc1_n_; + float* fc2_out = out->mutable_data(TARGET(kXPU)); + + std::vector concat_widths; + std::vector concat_ptrs; + for (const auto* t : concat_topk_x) { + concat_widths.push_back(static_cast(t->dims()[1])); + concat_ptrs.push_back(t->data()); + } + int r = 0; + r = xdnn::concat(ctx, + cap_l, + concat_widths.data(), + concat_widths.size(), + concat_ptrs.data(), + topk_concat_out_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_reverse(ctx, + batch, + sentense.lod_32, + cap_e_, + topk_concat_out_fw, + topk_concat_out_rv); + CHECK_EQ(r, 0); + coverage_fw_.Infer(ctx, + sentense, + topk_concat_out_fw, + grnn_fw, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + coverage_rv_.Infer(ctx, + sentense, + topk_concat_out_rv, + grnn_rv, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); + + const int concat_widths_fc0[] = { + static_cast(concat_7in1_x[0]->dims()[1]), + static_cast(concat_7in1_x[1]->dims()[1]), + static_cast(concat_7in1_x[2]->dims()[1]), + static_cast(concat_7in1_x[3]->dims()[1]), + static_cast(concat_7in1_x[4]->dims()[1]), + static_cast(concat_7in1_x[5]->dims()[1]), + static_cast(concat_7in1_x[6]->dims()[1]), + }; + const float* concat_ptrs_fc0[] = { + concat_7in1_x[0]->data(), + concat_7in1_x[1]->data(), + concat_7in1_x[2]->data(), + concat_7in1_x[3]->data(), + concat_7in1_x[4]->data(), + concat_7in1_x[5]->data(), + concat_7in1_x[6]->data(), + }; + const int concat_widths_fc1[] = {cap_h_, cap_h_, fc0_n_}; + const float* concat_ptrs_fc1[] = {pool_fw, pool_rv, fc0_out}; + + r = xdnn::concat( + ctx, batch, concat_widths_fc0, 7, concat_ptrs_fc0, fc0_in); + CHECK_EQ(r, 0); + fc0_.Infer(ctx, fc0_in, batch, fc0_out); + r = xdnn::concat( + ctx, batch, concat_widths_fc1, 3, concat_ptrs_fc1, fc1_in); + CHECK_EQ(r, 0); + fc1_.Infer(ctx, fc1_in, batch, fc1_out); + fc2_.Infer(ctx, fc1_out, batch, fc2_out); + } +}; + +class XPUMmdnnBidEmbGrnnAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + 
MMDNNBidEmbGrnnAtt compound_; +}; + +void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbGrnnAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnBidEmbGrnnAttCompute2 + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam2; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNBidEmbGrnnAtt compound_; +}; + +void XPUMmdnnBidEmbGrnnAttCompute2::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbGrnnAttCompute2::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); + + int num = param.id0->numel(); + int embed_dim = param.emb_tbl->dims()[1]; + + // TODO(miaotianxiang): + int r = xdnn::embedding( + ctx.GetRawContext(), /* context */ + num, /* num */ + param.id0->data(), /* indices */ + embed_dim, /* embed_dim */ + param.emb_tbl->data(), /* table */ + param.emb0_out->mutable_data(TARGET(kXPU)), /* top */ + 128000 /* padding_idx */); + CHECK_EQ(r, 0); +} + +class XPUMmdnnBidEmbAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNEmbAtt compound_; +}; + +void XPUMmdnnBidEmbAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.att_pool_out, + 
param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMatchConvTopkCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMatchConvTopkParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNMatchConvTopk compound_; +}; + +void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { + auto& param = this->Param(); + + compound_.Init(param.input_w, + param.input_w_max, + param.conv_w, + param.conv_w_max, + param.dim_t, + param.input_w->dims()[0], + param.output_channel, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN, + param.topks); +} + +void XPUMmdnnMatchConvTopkCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + compound_.Infer(ctx.GetRawContext(), + param.input_x, + param.input_y, + param.topk_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMergeAllCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMergeAllParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNMergeAll compound_; +}; + +void XPUMmdnnMergeAllCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.fc0_w, + param.fc0_w_max, + param.fc0_b, + param.fc1_w, + param.fc1_w_max, + param.fc1_b, + param.fc2_w, + param.fc2_w_max, + param.fc2_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnMergeAllCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + id_.Update(param.concat_topk_x[0], param.concat_topk_x[1]); + compound_.Infer(ctx.GetRawContext(), + id_, + param.concat_topk_x, + param.concat_7in1_x, + param.out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + 
.Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute2, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb0_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_match_conv_topk, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMatchConvTopkCompute, + def) + .BindInput("input_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("conv_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("topk_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_merge_all, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMergeAllCompute, + def) + .BindInput("concat_7in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("concat_topk_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h index 
71db4e6f44f9c36e4acdaf0a440463a61f4e3099..dbc2d785d42ad29dc1cfbe36f744b71662e48315 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.h b/lite/kernels/xpu/__xpu__resnet50_compute.h index 3d42f8b6f26edf615dba165b553b633673a4ae66..7ce8b1192ea9e85d83ddbeddc374378692866aa6 100644 --- a/lite/kernels/xpu/__xpu__resnet50_compute.h +++ b/lite/kernels/xpu/__xpu__resnet50_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d57445cd44953f504e292ad38d44d047daa3a7a --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__resnet_cbam_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUResNetCbamCompute::PrepareForRun() { + auto& param = this->Param(); + + for (auto* filter : param.filter) { + arg_filter_.push_back( + reinterpret_cast(filter->data())); + } + for (auto* bias : param.bias) { + if (bias == nullptr) { + arg_bias_.push_back(nullptr); + } else { + arg_bias_.push_back(bias->data()); + } + } + for (auto* max_filter : param.max_filter) { + arg_max_filter_.push_back(max_filter->data()); + } +} + +void XPUResNetCbamCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto input_dims = param.input->dims(); + int batch_size = input_dims[0]; + int height = input_dims[2]; + int width = input_dims[3]; + + int r = xdnn::conv2d_int16_resnet_cbam( + ctx.GetRawContext(), /* context */ + batch_size, /* num */ + height, /* height */ + width, /* width */ + param.input->data(), /* bottom */ + &arg_filter_[0], /* weight_list */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + &arg_bias_[0], /* bias_list */ + &arg_max_filter_[0], /* max_filter_list */ + param.pool_p, /* pool_p */ + true, /* midtype_fp16 */ + false /* dynamic_shape */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__resnet_cbam, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUResNetCbamCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); 
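Throughout the kernels above, every int16 weight blob travels with a float maximum (weight_max, the wi_maxs/wh_maxs vectors, the maxs_xpu staging buffer), and activations get theirs from xdnn::findmax right before the int16 GEMM runs. That pairing is the usual symmetric-quantization setup: the quantized values only make sense together with the original dynamic range. The sketch below shows the host-side idea in plain C++; QuantizeInt16 and Dequantize are illustrative helpers, not xdnn APIs.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Symmetric int16 quantization: one scale per tensor, derived from max|x|.
    // This mirrors the "weights + weight_max" pairs handed to the XPU kernels,
    // but runs on the host and is only an illustration of the convention.
    struct QuantizedTensor {
      std::vector<int16_t> data;
      float max_abs;  // the "max" value kept next to the quantized weights
    };

    QuantizedTensor QuantizeInt16(const std::vector<float>& x) {
      float max_abs = 0.0f;
      for (float v : x) max_abs = std::max(max_abs, std::fabs(v));
      QuantizedTensor q;
      q.max_abs = max_abs;
      q.data.reserve(x.size());
      const float scale = max_abs > 0.0f ? 32767.0f / max_abs : 0.0f;
      for (float v : x) {
        q.data.push_back(static_cast<int16_t>(std::round(v * scale)));
      }
      return q;
    }

    float Dequantize(int16_t v, float max_abs) {
      return max_abs > 0.0f ? v * max_abs / 32767.0f : 0.0f;
    }

For the activations the same quantity is what xdnn::findmax produces on the device; the gemm_int16_maxptr calls above then take the input, weight, and output maxima through their max_a/max_b/max_c pointer arguments.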
diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.h b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b952bb088ea88399966c170cbeadebfa698889d8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUResNetCbamCompute + : public KernelLite { + public: + using param_t = operators::XPUResNetCbamParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + std::vector arg_filter_; + std::vector arg_max_filter_; + std::vector arg_bias_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.cc b/lite/kernels/xpu/__xpu__search_attention_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..7f02f566dfb01f2d8a57302e714f4f2cb3d4b786 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/__xpu__search_attention_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUMmdnnSearchAttentionCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + w_max_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */); + buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */); + buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */); + + offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + pad_begin_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void XPUMmdnnSearchAttentionCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* X = param.X; + auto* W = param.W; + auto* b = param.b; + float W_max = param.W_max; + float alpha0 = param.alpha0; + float alpha1 = param.alpha1; + float mask = param.mask; + + const int16_t* w_data = W->data(); + const float* b_data = b->data(); + + int batch = X->lod()[0].size() - 1; + int dim0 = X->dims()[0]; + int dim1 = X->dims()[1]; + const auto offset = X->lod()[0]; + int max_seq = 0; + + auto* top = param.Out; + LoD top_lod; + top_lod.push_back(X->lod()[0]); + top->set_lod(top_lod); + top->Resize({dim0, dim1}); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, W_max, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < batch; ++i) { + offset_cpu[i] = offset[i]; // type of offset is int64, not supported by xpu + pad_begin_cpu[i] = offset[i + 1] - offset[i]; + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + offset_cpu[batch] = offset[batch]; + + XPU_CALL(xpu_memcpy(offset_xpu_guard_->addr_, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(pad_begin_xpu_guard_->addr_, + pad_begin_cpu.get(), + batch * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(w_max_xpu_guard_->addr_, + maxs_cpu, + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* pad_begin_xpu = reinterpret_cast(pad_begin_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(w_max_xpu_guard_->addr_); + float* buffer_at_l3 = reinterpret_cast(buffer_at_l3_guard_->addr_); + float* buffer_at_gm = reinterpret_cast(buffer_at_gm_guard_->addr_); + + // when use l3, max_seq <= 128: + // group_padding: batch * max_seq * dim1; at (slot0, slot1) + // seq_fc: batch * max_seq * dim1; at (slot2, slot3) + // batchgemm0: batch * max_seq * max_seq; at slot4 + // attention_padding_mask: batch * max_seq * max_seq; at slot3 + // seq_softmax: batch * max_seq * max_seq; at slot4 + // batchgemm1: batch * max_seq * dim1; at (slot2, slot3) + float* group_padding_output = buffer_at_l3; + float* seq_fc_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + float* batchgemm0_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* attention_output = buffer_at_l3 + 3 * L3_SLOT_SIZE; + float* seq_softmax_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* batchgemm1_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + + if (max_seq > 128) { + group_padding_output = buffer_at_gm; + 
seq_fc_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + batchgemm0_output = buffer_at_gm + 2 * GM_SLOT_SIZE; + attention_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + seq_softmax_output = buffer_at_gm + 3 * GM_SLOT_SIZE; + batchgemm1_output = buffer_at_gm + 4 * GM_SLOT_SIZE; + } + + const auto* bottom_data = X->data(); + int r = 0; + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + const_cast(bottom_data), + group_padding_output, + offset_xpu, + max_seq, + batch, + dim1, + 0); // is_depad = 0 + CHECK_EQ(r, 0); + // do-findmax + r = xdnn::findmax(ctx.GetRawContext(), + group_padding_output, + batch * max_seq * dim1, + maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + true, /* trans_b */ + batch * max_seq, /* m */ + dim1, /* n */ + dim1, /* k */ + 1.0f, /* alpha */ + group_padding_output, /* data_a */ + dim1, /* lda */ + w_data, /* data_b */ + dim1, /* ldb */ + 0.0f, /* beta */ + seq_fc_output, /* data_c */ + dim1, /* ldc */ + b_data, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 1, + batch, + max_seq, + max_seq, + dim1, + alpha0, + group_padding_output, + dim1, + seq_fc_output, + dim1, + batchgemm0_output, + max_seq); + CHECK_EQ(r, 0); + r = xdnn::search_pad_mask(ctx.GetRawContext(), + batchgemm0_output, + attention_output, + pad_begin_xpu, + batch, + max_seq, + max_seq, + batch, + mask); + CHECK_EQ(r, 0); + r = xdnn::softmax2d_forward(ctx.GetRawContext(), + attention_output, + seq_softmax_output, + batch * max_seq, + max_seq, + true); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 0, + batch, + max_seq, + dim1, + max_seq, + alpha1, + seq_softmax_output, + max_seq, + group_padding_output, + dim1, + batchgemm1_output, + dim1); + CHECK_EQ(r, 0); + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + top_data, + batchgemm1_output, + offset_xpu, + max_seq, + batch, + dim1, + 1); // is_depad = 1 + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_search_attention, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnSearchAttentionCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.h b/lite/kernels/xpu/__xpu__search_attention_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f9670dbab6247927acf6ac7d7b47f98a464a3489 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
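The search-attention Run above does all of its matrix work on a dense [batch, max_seq, dim1] layout: search_sequence_pad_depad first scatters the ragged LoD input into that padded buffer (is_depad = 0) and, at the very end, gathers the result back into LoD order (is_depad = 1). Below is a host-side sketch of the two directions; the zero-fill of the padded tail and the helper names are assumptions made for illustration, not xdnn's documented semantics.

    #include <cstring>
    #include <vector>

    // offsets is the level-0 LoD: sequence b occupies rows [offsets[b], offsets[b+1]).
    void PadSequences(const float* in, float* out,  // out: [batch, max_seq, dim]
                      const std::vector<int>& offsets, int max_seq, int dim) {
      int batch = static_cast<int>(offsets.size()) - 1;
      std::memset(out, 0, sizeof(float) * batch * max_seq * dim);  // assumed zero padding
      for (int b = 0; b < batch; ++b) {
        int len = offsets[b + 1] - offsets[b];
        std::memcpy(out + b * max_seq * dim, in + offsets[b] * dim,
                    sizeof(float) * len * dim);
      }
    }

    void DepadSequences(const float* in, float* out,  // in: [batch, max_seq, dim]
                        const std::vector<int>& offsets, int max_seq, int dim) {
      int batch = static_cast<int>(offsets.size()) - 1;
      for (int b = 0; b < batch; ++b) {
        int len = offsets[b + 1] - offsets[b];
        std::memcpy(out + offsets[b] * dim, in + b * max_seq * dim,
                    sizeof(float) * len * dim);
      }
    }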
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUMmdnnSearchAttentionCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnSearchAttentionParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard pad_begin_xpu_guard_; + XPUScratchPadGuard w_max_xpu_guard_; + XPUScratchPadGuard buffer_at_l3_guard_; + XPUScratchPadGuard buffer_at_gm_guard_; + + std::unique_ptr offset_cpu; + std::unique_ptr pad_begin_cpu; + + const int L3_SLOT_SIZE = 40 * 128 * 128; + const int GM_SLOT_SIZE = 40 * 512 * 512; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index e440bde4146a88929c52c20ff1038eb35be91d38..f2ad667886ac33191687b70aa7548050461545e7 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/batch_norm_compute.h b/lite/kernels/xpu/batch_norm_compute.h index 7b428476b96ca3b2b60c66df28b7f82e8f57bebc..f5244574cebab6b10bbd81af9c8303ffec9f0965 100644 --- a/lite/kernels/xpu/batch_norm_compute.h +++ b/lite/kernels/xpu/batch_norm_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h index 8992c29732630a5bf0d9c092461569234257e3a9..efd4cbae8d2d708b25729f04f36bc22d1d909e11 100644 --- a/lite/kernels/xpu/cast_compute.h +++ b/lite/kernels/xpu/cast_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f088bb80f0c500c6f900726195bcb5903049d3fb --- /dev/null +++ b/lite/kernels/xpu/concat_compute.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
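For a sense of scale, the two scratch pools sized by L3_SLOT_SIZE and GM_SLOT_SIZE in the search-attention header above (five slots each) work out as follows; the byte figures are plain arithmetic.

    #include <cstddef>

    // Footprint of the scratch slots declared in __xpu__search_attention_compute.h.
    constexpr std::size_t kL3SlotFloats = 40 * 128 * 128;  //    655,360 floats per L3 slot
    constexpr std::size_t kGmSlotFloats = 40 * 512 * 512;  // 10,485,760 floats per GM slot
    constexpr std::size_t kL3PoolBytes = 5 * kL3SlotFloats * sizeof(float);  // 12.5 MiB
    constexpr std::size_t kGmPoolBytes = 5 * kGmSlotFloats * sizeof(float);  // 200 MiB
    static_assert(kL3PoolBytes == 13107200, "five L3 slots take 12.5 MiB");
    static_assert(kGmPoolBytes == 209715200, "five GM slots take 200 MiB");

This lines up with the max_seq > 128 branch in Run: once a padded [batch, max_seq, max_seq] tile no longer fits the smaller slots, the kernel presumably falls back to the larger general-memory pool.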
+ +#include "lite/kernels/xpu/concat_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto ins = param.x; + auto out = param.output; + int64_t axis = param.axis; + + int n = ins.size(); + int h = 1; + int w_except_axis = 1; + CHECK(n <= 8) << "XPU only surpport at most 8 tensors for now"; + for (int i = 0; i < axis; ++i) { + h *= (ins[0]->dims())[i]; + } + for (int i = axis + 1; i < ins[0]->dims().size(); ++i) { + w_except_axis *= (ins[0]->dims())[i]; + } + CHECK(axis >= 0) << "concat: axis shoud >= 0!"; + CHECK(axis < ins[0]->dims().size()) << "concat: axis shoud < ins[0]->dims()!"; + for (int i = 0; i < n; ++i) { + int hh = 1; + int ww = 1; + for (int j = 0; j < axis; ++j) { + hh *= (ins[i]->dims())[j]; + } + for (int j = axis + 1; j < ins[i]->dims().size(); ++j) { + ww *= (ins[i]->dims())[j]; + } + CHECK(hh == h) << "concat: h should be eual!"; + CHECK(ww == w_except_axis) << "concat: w should be eual except for axis!"; + } + + int in_w_host[n]; // NOLINT + const float* ptrs[n]; // NOLINT + + for (int i = 0; i < n; ++i) { + ptrs[i] = ins[i]->data(); + in_w_host[i] = w_except_axis * (ins[i]->dims())[axis]; + } + + int r = xdnn::concat(ctx.GetRawContext(), /* ctx */ + h, /* height */ + in_w_host, /* width_x */ + n, /* n */ + ptrs, /* lm_ptrs */ + out->mutable_data(TARGET(kXPU)) /*y*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + concat, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ConcatCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/utils.h b/lite/kernels/xpu/concat_compute.h similarity index 78% rename from lite/kernels/xpu/utils.h rename to lite/kernels/xpu/concat_compute.h index d410cb1567d5c60aeb52b798d9f17c7f5692e096..f29899a741194270272770d8b781cd9b0b54abc9 100644 --- a/lite/kernels/xpu/utils.h +++ b/lite/kernels/xpu/concat_compute.h @@ -14,15 +14,20 @@ #pragma once -#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" namespace paddle { namespace lite { namespace kernels { namespace xpu { -struct XPUFreeDeleter { - void operator()(void* p) const { xpu_free(p); } +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + virtual void Run(); + + virtual ~ConcatCompute() = default; }; } // namespace xpu diff --git a/lite/kernels/xpu/conv_compute.h b/lite/kernels/xpu/conv_compute.h index b7631ce4e5773afe7cdd797a245c806b51d25c56..76159444c1861fad14b6ac4f0d32da626b3a8802 100644 --- a/lite/kernels/xpu/conv_compute.h +++ b/lite/kernels/xpu/conv_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/dropout_compute.h b/lite/kernels/xpu/dropout_compute.h index 0eaafb4f5555a163623402fee82d50bfa095b0b3..360450df537a68b9412d21db4e06dc74d6071ca6 100644 --- a/lite/kernels/xpu/dropout_compute.h +++ b/lite/kernels/xpu/dropout_compute.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h index 863ee3c643f9c431dacd057e251941914b1dd1c5..d910b9293e74428c426d9505245bc5958fc9df3a 100644 --- a/lite/kernels/xpu/elementwise_compute.h +++ b/lite/kernels/xpu/elementwise_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/layer_norm_compute.h b/lite/kernels/xpu/layer_norm_compute.h index 5d2df37795811ef8027e12b25139f2b7091cceed..9eeb5924c512fcfbf8825a9ff775378dfe4d6d4c 100644 --- a/lite/kernels/xpu/layer_norm_compute.h +++ b/lite/kernels/xpu/layer_norm_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/lookup_table_compute.cc b/lite/kernels/xpu/lookup_table_compute.cc index 568d303adefaa06bb8665b4cc92d4a949419d587..4256687fa8c17c7fe36e91ff727d52eb1047646f 100644 --- a/lite/kernels/xpu/lookup_table_compute.cc +++ b/lite/kernels/xpu/lookup_table_compute.cc @@ -29,12 +29,13 @@ void LookupTableCompute::Run() { int embed_dim = param.W->dims()[1]; int r = xdnn::embedding( - ctx.GetRawContext(), /* context */ - num, /* num */ - param.Ids->data(), /* indices */ - embed_dim, /* embed_dim */ - param.W->data(), /* table */ - param.Out->mutable_data(TARGET(kXPU)) /* top */); + ctx.GetRawContext(), /* context */ + num, /* num */ + param.Ids->data(), /* indices */ + embed_dim, /* embed_dim */ + param.W->data(), /* table */ + param.Out->mutable_data(TARGET(kXPU)), /* top */ + param.padding_idx /* padding_idx */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/lookup_table_compute.h b/lite/kernels/xpu/lookup_table_compute.h index 2ba1afc869cf9c3a49ab1ad29c66c6c89ba87d19..7a43f5244e5d514a1644aac0437951af35bb7767 100644 --- a/lite/kernels/xpu/lookup_table_compute.h +++ b/lite/kernels/xpu/lookup_table_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.cc b/lite/kernels/xpu/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3ee547ccce56cd16401e4aca465e64d99a26185 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
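The lookup_table change above threads param.padding_idx through to xdnn::embedding. Under the usual lookup-table convention, which this reference sketch assumes rather than quotes from xdnn's documentation, ids equal to padding_idx yield an all-zero row instead of a table read:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Host-side reference for an embedding lookup with padding_idx.
    // table: [vocab, embed_dim], ids: [num], out: [num, embed_dim]
    void EmbeddingRef(const float* table, const int64_t* ids, int num,
                      int embed_dim, int64_t padding_idx, float* out) {
      for (int i = 0; i < num; ++i) {
        float* dst = out + i * embed_dim;
        if (ids[i] == padding_idx) {
          std::memset(dst, 0, sizeof(float) * embed_dim);  // padded position -> zeros
        } else {
          std::memcpy(dst, table + ids[i] * embed_dim, sizeof(float) * embed_dim);
        }
      }
    }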
+ +#include "lite/kernels/xpu/match_matrix_tensor_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void MatchMatrixTensorCompute::PrepareForRun() { + wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + offset_l_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_r_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void MatchMatrixTensorCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.x; + auto* y = param.y; + auto* w = param.w; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int dim_in = x->dims()[1]; + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* w_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kXPU)); + int batch_size = x->lod()[0].size() - 1; + + float* wx_max = reinterpret_cast(wx_max_xpu_guard_->addr_); + int* offset_l_xpu = reinterpret_cast(offset_l_xpu_guard_->addr_); + int* offset_r_xpu = reinterpret_cast(offset_r_xpu_guard_->addr_); + + int r = xdnn::gemm_int16_tmp_api( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + false, /* trans_b */ + x->dims()[0], /* m */ + dim_t * dim_in, /* n */ + dim_in, /* k */ + 1.0f, /* alpha */ + bottom_l_data, /* data_a */ + dim_in, /* lda */ + w_data, /* data_b */ + dim_t * dim_in, /* ldb */ + 0.0f, /* beta */ + bottom_l_trans_data, /* data_c */ + dim_t * dim_in, /* ldc */ + nullptr, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + 0.0f, /* max_a */ + w_max, /* max_b */ + wx_max /* max_c */); + CHECK_EQ(r, 0); + + int max_width = 0; + for (int i = 0; i < offset_l.size(); ++i) { + offset_l_cpu[i] = offset_l[i]; + if (i != 0 && (offset_l_cpu[i] - offset_l_cpu[i - 1] > max_width)) { + max_width = offset_l_cpu[i] - offset_l_cpu[i - 1]; + } + } + for (int i = 0; i < offset_r.size(); ++i) { + offset_r_cpu[i] = offset_r[i]; + if (i != 0 && (offset_r_cpu[i] - offset_r_cpu[i - 1] > max_width)) { + max_width = offset_r_cpu[i] - offset_r_cpu[i - 1]; + } + } + XPU_CALL(xpu_memcpy(offset_l_xpu, + offset_l_cpu.get(), + offset_l.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_r_xpu, + offset_r_cpu.get(), + offset_r.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + r = xdnn::match_matrix_tensor(ctx.GetRawContext(), + batch_size, + bottom_l_trans_data, + bottom_r_data, + offset_l_xpu, + offset_r_xpu, + dim_t, + dim_in, + out_data, + wx_max, + 
act,
+                                max_width);
+  CHECK_EQ(r, 0);
+
+  int lod_lv1_size = batch_size * dim_t;
+  int lod_lv2_size = x->lod()[0].back() * dim_t;
+  std::vector<size_t> out_lod0(batch_size + 1, 0);
+  std::vector<size_t> out_lod1(lod_lv1_size + 1, 0);
+  std::vector<size_t> out_lod2(lod_lv2_size + 1, 0);
+  for (int i = 0; i < batch_size; i++) {
+    out_lod0[i + 1] = out_lod0[i] + dim_t;
+    int len_l = offset_l[i + 1] - offset_l[i];
+
+    for (int j = 0; j < dim_t; j++) {
+      out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l;
+      int len_r = offset_r[i + 1] - offset_r[i];
+
+      for (int k = 0; k < len_l; k++) {
+        out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] =
+            out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r;
+      }
+    }
+  }
+
+  paddle::lite::LoD out_lod;
+  out_lod.push_back(top_offset);
+  out_lod.push_back(offset_l);
+  out_lod.push_back(offset_r);
+  out->set_lod(out_lod);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(match_matrix_tensor,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::MatchMatrixTensorCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.h b/lite/kernels/xpu/match_matrix_tensor_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bd0b622db1fce178ea66604d89dc50d6477a105
--- /dev/null
+++ b/lite/kernels/xpu/match_matrix_tensor_compute.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class MatchMatrixTensorCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatchMatrixTensorParam;
+
+  virtual void PrepareForRun();
+
+  virtual void Run();
+
+ private:
+  XPUScratchPadGuard wx_max_xpu_guard_;
+  XPUScratchPadGuard offset_l_xpu_guard_;
+  XPUScratchPadGuard offset_r_xpu_guard_;
+
+  // Host staging buffers for the LoD offsets copied to the XPU scratchpads.
+  std::unique_ptr<int[]> offset_l_cpu;
+  std::unique_ptr<int[]> offset_r_cpu;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h
index aca3cbc603eff490ae19fd2546352adca3c1a7cf..0fef2086e294fa5cd79e49adeb6b136f484a1efd 100644
--- a/lite/kernels/xpu/matmul_compute.h
+++ b/lite/kernels/xpu/matmul_compute.h
@@ -13,6 +13,7 @@ // limitations under the License.
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/mul_compute.h b/lite/kernels/xpu/mul_compute.h index bb2778c0e73189b11135395b42655e0250bbfd0a..3c91384b726a4d43c6a38e96d143657c12dadd8a 100644 --- a/lite/kernels/xpu/mul_compute.h +++ b/lite/kernels/xpu/mul_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 5648554c41c76396184b7dc536f8c8628cbf23e4..39e14f04a8c41bc057ac5733d881ba713c0883b2 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h index 6989b0f0f31e54a63dac2f7c2090dc676e31acfb..5a84fe26a0d409dcd979ca7c26128775a4f64df2 100644 --- a/lite/kernels/xpu/scale_compute.h +++ b/lite/kernels/xpu/scale_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/search_fc_compute.cc b/lite/kernels/xpu/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..52a9999b468564d81288ce494f575a8d1d46e4fc --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
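A note on the buffer pattern that repeats across these new XPU kernels (match_matrix_tensor above, search_fc below, and the sequence_* kernels later in this patch): LoD offsets are kept on the host as 64-bit values, while the xdnn primitives read 32-bit offsets from device memory, so each kernel allocates a scratchpad once in PrepareForRun() and narrows/copies the offsets in Run(). The sketch below is illustrative only and uses the TargetWrapperXPU / XPUScratchPadGuard / XPU_CALL interfaces already referenced in this diff; LodStagingSketch and StageLod are hypothetical names.

#include <cstdint>
#include <memory>
#include <vector>
#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard

class LodStagingSketch {
 public:
  void PrepareForRun() {
    // One-time device allocation, reused by every Run() call.
    lod_xpu_guard_ = paddle::lite::TargetWrapperXPU::MallocScratchPad(
        XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
    lod_cpu_.reset(new int[XPU_MAX_LOD_SIZE]);
  }

  int* StageLod(const std::vector<uint64_t>& lod) {
    // Narrow the 64-bit host offsets to the 32-bit ints expected by xdnn,
    // then copy them into the device scratchpad.
    for (size_t i = 0; i < lod.size(); ++i) {
      lod_cpu_[i] = static_cast<int>(lod[i]);
    }
    int* lod_xpu = reinterpret_cast<int*>(lod_xpu_guard_->addr_);
    XPU_CALL(xpu_memcpy(lod_xpu,
                        lod_cpu_.get(),
                        lod.size() * sizeof(int),
                        XPUMemcpyKind::XPU_HOST_TO_DEVICE));
    return lod_xpu;
  }

 private:
  paddle::lite::XPUScratchPadGuard lod_xpu_guard_;
  std::unique_ptr<int[]> lod_cpu_;
};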
+ +#include "lite/kernels/xpu/search_fc_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchFcCompute::PrepareForRun() { + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(float), false /* use_l3 */); +} + +void SearchFcCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* b = param.b; + auto* top = param.Out; + float w_max = param.__xpu__w_max; + int out_size = param.out_size; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int batch = bottom->dims()[0]; + int _out = w->dims()[0]; + int _in = w->dims()[1]; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + std::vector top_dims{bottom->dims()[0], out_size}; + top->Resize(top_dims); + + const auto* bottom_data = bottom->data(); + const auto* weights = w->data(); + const auto* bias_data = b->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f}; + XPU_CALL(xpu_memcpy(maxs_xpu, + &maxs_cpu[0], + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::findmax( + ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + true, /* trans_b */ + batch, /* m */ + _out, /* n */ + _in, /* k */ + 1.0f, /* alpha */ + bottom_data, /* data_a */ + _in, /* lda */ + weights, /* data_b */ + _in, /* ldb */ + 0.0f, /* beta */ + top_data, /* data_c */ + _out, /* ldc */ + bias_data, /* bias */ + act, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_fc_compute.h b/lite/kernels/xpu/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c7ee06abd957187c18c1306f40a77735f40558e7 --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SearchFcCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SearchFcParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  XPUScratchPadGuard maxs_xpu_guard_;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/search_grnn_compute.cc b/lite/kernels/xpu/search_grnn_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4e2e4a9969149b0d2f7f2b75c195d1b3a5fda5c
--- /dev/null
+++ b/lite/kernels/xpu/search_grnn_compute.cc
@@ -0,0 +1,285 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/search_grnn_compute.h"
+#include <algorithm>
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+void SearchGrnnCompute::PrepareForRun() {
+  offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */);
+  new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(
+      XPU_MAX_LOD_SEQ_LEN * sizeof(int), false /* use_l3 */);
+  maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float),
+                                                       false /* use_l3 */);
+
+  idx_sorted_by_width_data_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]);
+  new_offset_cpu.reset(new int[XPU_MAX_LOD_SEQ_LEN]);
+}
+
+void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param,
+                                       const paddle::lite::Tensor* bottom) {
+  auto* idx_sorted_by_width = param.idx_sorted_by_width;
+  auto* layout_input = param.layout_input;
+
+  int dim0 = bottom->dims()[0];
+  int dim1 = 1;
+  if (bottom->dims().size() > 1) {
+    dim1 = bottom->dims()[1];
+  }
+  int batch = bottom->lod()[0].size() - 1;
+  auto& offset = bottom->lod()[0];
+
+  idx_sorted_by_width->Resize({batch});
+  std::vector<int> width;
+  width.resize(batch);
+
+  // sort sequences by width (descending) and find the largest width in the
+  // batch
+  for (int i = 0; i < batch; i++) {
+    width[i] = offset[i + 1] - offset[i];
+    idx_sorted_by_width_data_cpu[i] = i;
+  }
+  std::sort(idx_sorted_by_width_data_cpu.get(),
+            idx_sorted_by_width_data_cpu.get() + batch,
+            [&width](int a, int b) { return width[a] > width[b]; });
+  int max_width = width[idx_sorted_by_width_data_cpu[0]];
+
+  // start of reorganizing the input
+  std::vector<uint64_t> new_offset;
+  new_offset.resize(max_width + 1);
+  new_offset[0] = 0;
+  int j = batch - 1;
+  int last_width = 0;
+  int sub_row = 0;
+  int sub_col = 0;
+
+  for (int i = 1; i <= max_width;) {
+    for (int k = j; k >= 0; --k) {
+      if (width[idx_sorted_by_width_data_cpu[k]] > last_width) {
+        sub_row = width[idx_sorted_by_width_data_cpu[k]] - last_width;
+        sub_col = k + 1;
+        for (int
s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted_by_width_data_cpu[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (bottom->dims().size() == 1) { + } else { + LoD new_lod; + new_lod.push_back(new_offset); + layout_input->set_lod(new_lod); + layout_input->Resize({dim0, dim1}); + } + + XPU_CALL(xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), + idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width->numel() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); +} + +void SearchGrnnCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* tmp_buffer = param.tmp_buffer; + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + int cap_h = param.num_hidden; + int cap_e = param.num_input; + int cap_l = bottom->dims()[0]; + auto wi_max = param.__xpu__wi_max; + auto wh_max = param.__xpu__wh_max; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int dim = 1; + if (bottom->dims().size() > 1) { + dim = bottom->dims()[1]; + } + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{cap_l, cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kXPU)); + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + // Prepare idx_sorted_by_width + prepare_layout(param, bottom); + int batch = bottom->lod()[0].size() - 1; + int max_width = layout_input->lod()[0].size() - 1; + const auto& new_offset = layout_input->lod()[0]; + auto* new_emb = layout_input->mutable_data(TARGET(kXPU)); + + // Prepare offset and new_offset + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* new_offset_xpu = reinterpret_cast(new_offset_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + CHECK_LE(offset.size(), 64); + CHECK_LE(new_offset.size(), 256); + + for (size_t i = 0; i < offset.size(); ++i) { + offset_cpu[i] = offset[i]; + } + for (size_t i = 0; i < new_offset.size(); ++i) { + new_offset_cpu[i] = new_offset[i]; + } + XPU_CALL(xpu_memcpy(offset_xpu, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(new_offset_xpu, + new_offset_cpu.get(), + new_offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::search_seq2batch(ctx.GetRawContext(), + batch, + max_width, + dim, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + bottom->data(), + new_emb); + CHECK_EQ(r, 0); + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + tmp_buffer->Resize({20, cap_l, cap_h}); + auto* buffer_data = tmp_buffer->mutable_data(TARGET(kXPU)); + // the internal hidden + auto* hidden = buffer_data + 19 * cap_l * cap_h; + + // do-findmax + float maxs_cpu[16] = {0.0f, + 0.0f, + 0.0f, + 0.0f, + wi_max[0], + 0.0f, + 0.0f, + 0.0f, + wi_max[1], + 0.0f, + 0.0f, + 0.0f, + wi_max[2], + 0.0f, + 0.0f, + 0.0f}; + XPU_CALL(xpu_memcpy(maxs_xpu, + maxs_cpu, + 16 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + r = xdnn::findmax( + ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu); + CHECK_EQ(r, 0); + + // precompute embedding to hidden + 
for (int i = 0; i < 3; ++i) { + const int16_t* data_b = dense_e2h + i * cap_e * cap_h; // e2h, e2hr, e2hz + float* data_c = buffer_data + i * cap_l * cap_h; // w_x_e, wr_x_e, wz_x_e + int r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + cap_l, + cap_h, + cap_e, // m, n, k + 1.0f, + new_emb, + cap_e, // alpha, data_a, lda + data_b, + cap_e, + 0.0f, // data_b, ldb, beta + data_c, + cap_h, // data_c, ldc + nullptr, + xdnn::Activation_t::LINEAR, // bias, act + maxs_xpu, + maxs_xpu + 4 * (i + 1)); // max_a, max_b + CHECK_EQ(r, 0); + } + + r = xdnn::search_grnn(ctx.GetRawContext(), + cap_l, + cap_h, + cap_e, + max_width, + new_offset_xpu, + buffer_data, + dense_h2h, + hidden, + wh_max[0], + wh_max[1], + wh_max[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx.GetRawContext(), + batch, + max_width, + cap_h, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + hidden, + top_hidden); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_grnn_compute.h b/lite/kernels/xpu/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7208e782474d39eabb41b4bc969d27a1d7d5f797 --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
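The least obvious part of the GRNN kernel is prepare_layout() above: sequences are sorted by length in descending order and new_offset[t] accumulates how many sequences are still active at time step t, i.e. the batch-major layout consumed by xdnn::search_seq2batch. The standalone host-side reference below (not part of the patch, names illustrative) reproduces the same offsets.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// new_offset has one entry per time step; each step adds the number of
// sequences whose length reaches that step.
std::vector<int> BuildNewOffset(std::vector<int> widths) {
  std::sort(widths.begin(), widths.end(), std::greater<int>());
  const int max_width = widths.front();
  std::vector<int> new_offset(max_width + 1, 0);
  for (int t = 1; t <= max_width; ++t) {
    int active = 0;
    for (int w : widths) active += (w >= t);
    new_offset[t] = new_offset[t - 1] + active;
  }
  return new_offset;
}

int main() {
  // Sequence lengths {3, 1, 2} -> new_offset {0, 3, 5, 6}: three rows are
  // processed at step 0, two at step 1 and one at step 2.
  for (int v : BuildNewOffset({3, 1, 2})) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}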
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void PrepareForRun() override; + + void prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom); + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard new_offset_xpu_guard_; + XPUScratchPadGuard maxs_xpu_guard_; + + std::unique_ptr idx_sorted_by_width_data_cpu; + std::unique_ptr offset_cpu; + std::unique_ptr new_offset_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.cc b/lite/kernels/xpu/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1b9866123395b2d7867154c3b398adae670ed97 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_arithmetic_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceArithmeticCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* top = param.Out; + + int op_type = param.op_type; + + auto len1 = bottom0->numel(); + auto len2 = bottom1->numel(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + int r = 0; + switch (op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + r = xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, 
bottom_data1, top_data, len2); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + } else { + r = xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); + } + break; + default: + break; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.h b/lite/kernels/xpu/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9526587ac48cd5025022d646e31c24cac6b59a13 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_concat_compute.cc b/lite/kernels/xpu/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..349fdbad2a89300703c820588b4647bfba77ece5 --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.cc @@ -0,0 +1,143 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
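For reference, the sequence_arithmetic / search_seq_arithmetic kernel registered above applies the elementwise op only over the first min(len_x, len_y) elements; when X is longer, the remaining tail of X is copied through unchanged, which is what the elementwise_* + memcpy_device pairs implement. A CPU sketch of the addition case (helper name illustrative, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <vector>

// CPU reference for op_type == 1 (addition) including the tail-copy behaviour.
std::vector<float> SeqAddReference(const std::vector<float>& x,
                                   const std::vector<float>& y) {
  std::vector<float> out(x.size());
  const size_t common = std::min(x.size(), y.size());
  for (size_t i = 0; i < common; ++i) out[i] = x[i] + y[i];
  // Elements of X beyond the overlap are passed through untouched.
  std::copy(x.begin() + common, x.end(), out.begin() + common);
  return out;
}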
+ +#include "lite/kernels/xpu/sequence_concat_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceConcatCompute::PrepareForRun() { + lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + lod0_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + lod1_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +void SequenceConcatCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto xs = param.X; + auto out = param.Out; + + size_t lod_size = 0; + for (auto& x : xs) { + if (lod_size == 0) { + lod_size = x->lod()[0].size(); + } else { + CHECK_EQ(lod_size, x->lod()[0].size()) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + + // TODO(miaotianxiang): + int64_t dim0 = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto& tensor : param.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.data(); + } + dim0 += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = dim0; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + CHECK(xs.size() == 2) << "XPU only support sequence_pool for 2 tensors"; + + auto lod0 = xs[0]->lod()[0]; + auto lod1 = xs[1]->lod()[0]; + int batch_size = lod0.size() - 1; + + int* lod0_xpu = reinterpret_cast(lod0_xpu_guard_->addr_); + int* lod1_xpu = reinterpret_cast(lod1_xpu_guard_->addr_); + for (int i = 0; i < lod0.size(); ++i) { + lod0_cpu[i] = lod0[i]; + } + for (int i = 0; i < lod1.size(); ++i) { + lod1_cpu[i] = lod1[i]; + } + XPU_CALL(xpu_memcpy(lod0_xpu, + lod0_cpu.get(), + lod0.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(lod1_xpu, + lod1_cpu.get(), + lod1.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_concat(ctx.GetRawContext(), + xs[0]->data(), + lod0_xpu, + xs[1]->data(), + lod1_xpu, + out->mutable_data(TARGET(kXPU)), + batch_size); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_concat_compute.h b/lite/kernels/xpu/sequence_concat_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..5726671975d546d1e549ecbe95790c11faafba7b
--- /dev/null
+++ b/lite/kernels/xpu/sequence_concat_compute.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "lite/backends/xpu/target_wrapper.h"  // XPUScratchPadGuard
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+class SequenceConcatCompute
    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SequenceConcatParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+ private:
+  XPUScratchPadGuard lod0_xpu_guard_;
+  XPUScratchPadGuard lod1_xpu_guard_;
+
+  std::unique_ptr<int[]> lod0_cpu;
+  std::unique_ptr<int[]> lod1_cpu;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8e71639b7f4c67f7e60103a42766a4d32026bc1
--- /dev/null
+++ b/lite/kernels/xpu/sequence_pool_compute.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
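The ConcatLoD helper in sequence_concat_compute.cc above merges the level-0 LoD of the inputs by summing their offsets per sequence index, e.g. {0, 2, 5} and {0, 1, 3} merge to {0, 3, 8}. A host-side illustration (not part of the patch):

#include <cstdint>
#include <vector>

// Entry i of the merged LoD is the sum of entry i of every input LoD.
std::vector<uint64_t> MergeLod0(const std::vector<std::vector<uint64_t>>& lods) {
  std::vector<uint64_t> result(lods[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    for (const auto& lod : lods) result[i] += lod[i];
  }
  return result;
}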
+ +#include "lite/kernels/xpu/sequence_pool_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUSequencePoolCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void XPUSequencePoolCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* out = param.Out; + std::string pool_type_str = param.pool_type; + + auto dims = in->dims(); + auto lod = in->lod(); + dims[0] = lod[0].size() - 1; + + xdnn::Pooling_t pool_type = xdnn::Pooling_t::MAX_WITHOUT_INDEX; + if (pool_type_str == "MAX") { + } else if (pool_type_str == "LAST") { + pool_type = xdnn::Pooling_t::LAST; + } else { + CHECK(false); + } + + int num_seq = out->dims()[0]; + int dim = out->numel() / num_seq; + + auto in_lod = in->lod()[0]; + for (size_t i = 0; i < in_lod.size(); ++i) { + lod_cpu[i] = in_lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = + xdnn::sequence_pooling_forward(ctx.GetRawContext(), + pool_type, + num_seq, + lod_xpu, + dim, + in->data(), + nullptr /* index */, + out->mutable_data(TARGET(kXPU))); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUSequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_pool_compute.h b/lite/kernels/xpu/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..232634de0e387e764eccdeeda4cb8fd2d5dce598 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
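sequence_pool above only wires up the MAX and LAST modes of xdnn::sequence_pooling_forward, producing one output row per LoD segment. A CPU reference for the MAX case, useful when checking device results on the host (name illustrative, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// One output row per LoD segment; each element is the maximum over that
// segment's rows. With lod {0, 2, 5} and dim 3, rows 0-1 reduce to output
// row 0 and rows 2-4 reduce to output row 1.
std::vector<float> SeqMaxPoolReference(const std::vector<float>& in,
                                       const std::vector<uint64_t>& lod,
                                       int dim) {
  const int num_seq = static_cast<int>(lod.size()) - 1;
  std::vector<float> out(num_seq * dim, std::numeric_limits<float>::lowest());
  for (int s = 0; s < num_seq; ++s) {
    for (uint64_t r = lod[s]; r < lod[s + 1]; ++r) {
      for (int d = 0; d < dim; ++d) {
        out[s * dim + d] = std::max(out[s * dim + d], in[r * dim + d]);
      }
    }
  }
  return out;
}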
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUSequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_reverse_compute.cc b/lite/kernels/xpu/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..bb3f37890b644a660c594fb0fd6eea332b90b8d6 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_reverse_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void SequenceReverseCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* y = param.Out; + + auto lod = x->lod()[0]; + size_t limit = x->numel(); + size_t ele_cnt_in_4_byte = limit / x->dims()[0]; + auto* x_data = x->template data(); + auto* y_data = y->template mutable_data(TARGET(kXPU)); + int batch_size = lod.size() - 1; + + if (std::is_same::value) { + ele_cnt_in_4_byte /= 4; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } + + for (size_t i = 0; i < lod.size(); ++i) { + lod_cpu[i] = lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_reverse(ctx.GetRawContext(), + batch_size, + lod_xpu, + ele_cnt_in_4_byte, + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using SequenceReverseFp32 = + xpu::SequenceReverseCompute; +using SequenceReverseInt64 = + xpu::SequenceReverseCompute; + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kFloat, kNCHW, SequenceReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kInt64, kNCHW, SequenceReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_reverse_compute.h b/lite/kernels/xpu/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91b285de767c65f93352380df7877e53d61ccd53 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e8485e2999b29dfb487d0c7c632fcfa7a9a3d00 --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/sequence_topk_avg_pooling_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceTopkAvgPoolingCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + in_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + row_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + col_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void SequenceTopkAvgPoolingCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* row = param.ROW; + auto* col = param.COLUMN; + auto* out = param.Out; + auto* pos = param.pos; + + auto channel_num = param.channel_num; + auto topks = param.topks; + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + std::vector vec_pos_shape; + vec_pos_shape.push_back(pos_total_size); + pos->Resize(vec_pos_shape); + auto pos_data = pos->mutable_data(TARGET(kXPU)); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(TARGET(kXPU)); + + int* in_lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + int* row_lod_xpu = in_lod_xpu + in_lod.size(); + int* col_lod_xpu = row_lod_xpu + row_lod.size(); + int* topks_xpu = col_lod_xpu + col_lod.size(); + for (int i = 0; i < in_lod.size(); ++i) { + in_lod_cpu[i] = in_lod[i]; + } + for (int i = 0; i < row_lod.size(); ++i) { + row_lod_cpu[i] = row_lod[i]; + } + for (int i = 0; i < col_lod.size(); ++i) { + col_lod_cpu[i] = col_lod[i]; + } + XPU_CALL(xpu_memcpy(in_lod_xpu, + in_lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(row_lod_xpu, + row_lod_cpu.get(), + row_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(col_lod_xpu, + col_lod_cpu.get(), + col_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topks_xpu, + topks.data(), + topks.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(), + in_data, + out_data, + pos_data, + batch_size, + channel_num, + in_lod_xpu, + row_lod_xpu, + col_lod_xpu, + topks_xpu, + k_num); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_topk_avg_pooling, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..7c54ca96225ee9ec37d6d0487a526347c19fdb2d --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr in_lod_cpu; + std::unique_ptr row_lod_cpu; + std::unique_ptr col_lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/softmax_compute.h b/lite/kernels/xpu/softmax_compute.h index e807f38a2ea3c9645b78340ac4dc87d1984c40f7..a3d282588776b7d64bc856adf92685c8524af035 100644 --- a/lite/kernels/xpu/softmax_compute.h +++ b/lite/kernels/xpu/softmax_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc index 90a6c70b49f39ce744f2a03eec41d79ddc768a19..156162923ceeb4abed466164b11672715f813fd7 100644 --- a/lite/kernels/xpu/stack_compute.cc +++ b/lite/kernels/xpu/stack_compute.cc @@ -25,9 +25,8 @@ void StackCompute::PrepareForRun() { auto& param = this->Param(); int n = param.X.size(); - void* x_ptr = nullptr; - xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */); - x_ptr_guard_.reset(x_ptr); + x_ptr_guard_ = TargetWrapperXPU::MallocScratchPad( + n * 8 /* sizeof(__global__ float*) */, false /* use_l3 */); x_ptr_cpu_.reserve(n); } @@ -47,14 +46,15 @@ void StackCompute::Run() { for (int i = 0; i < n; ++i) { x_ptr_cpu_[i] = param.X[i]->data(); } - xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy( + x_ptr_guard_->addr_, &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE)); int r = xdnn::stack_forward( ctx.GetRawContext(), /* context */ height, /* height */ width, /* width */ n, /* n */ - x_ptr_guard_.get(), /* x_ptr */ + x_ptr_guard_->addr_, /* x_ptr */ param.Out->mutable_data(TARGET(kXPU)) /* out */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h index 1ba1d92dc9479cfd00c5e154df7b5476ffd9976c..7618e2a147b862aee097a42b36721d520ad6012c 100644 --- a/lite/kernels/xpu/stack_compute.h +++ b/lite/kernels/xpu/stack_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -35,7 +34,7 @@ class StackCompute : public KernelLite { virtual ~StackCompute() = 
default; private: - std::unique_ptr x_ptr_guard_; + XPUScratchPadGuard x_ptr_guard_; std::vector x_ptr_cpu_; }; diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 9c2191331c85a7f99ffb5a2e9662ed5831cb1dda..ac301108386e2da43b2efc372b96531df8d55523 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -27,26 +27,50 @@ namespace lite { namespace kernels { namespace xpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; // Convert all of ops and their input vars and weights and added into the XPU // IR graph subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + if (!origin_program_) { + BuildOriginProgram(); + } + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kXPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kXPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the XPU IR graph and build the graph to the XPU @@ -86,7 +110,7 @@ int SubgraphEngine::BuildDeviceProgram() { &graph.builder_, &graph.params_, &device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[XPU] Build model failed!"; - return subgraph::FAILED; + return false; } // Query and check the dimensions of input and output tensors @@ -100,7 +124,7 @@ int SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_inames_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); + origin_itensors_[i] = exec_scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] @@ -124,7 +148,7 @@ int SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_onames_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + origin_otensors_[i] = exec_scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] @@ -166,10 +190,10 @@ int SubgraphEngine::BuildDeviceProgram() { device_otensors_[i].strides = nullptr; device_otensors_[i].byte_offset = 0; } - return status; 
+ return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_itensors_.size(); i++) { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = @@ -191,24 +215,23 @@ int SubgraphEngine::LaunchDeviceProgram() { const_cast(origin_otensors_[i]->raw_data()); device_program_->CopyOutputTo(i, &device_otensors_[i]); } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace xpu diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 601c8821bc826e350c233573bf7eff89cdf5c1f5..25ffa721572ce05b0652d56659f3db12903c589b 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -31,21 +31,26 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::vector device_inames_; std::vector device_onames_; - std::vector device_itensors_; - std::vector device_otensors_; + std::vector device_itensors_{}; + std::vector device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/xpu/var_conv_2d_compute.cc b/lite/kernels/xpu/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b73581951f46a5f3cdbaf64cf732b1909805d27d --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/var_conv_2d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void VarConv2DCompute::PrepareForRun() { + offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_x_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_y_cpu.reset(new int[XPU_MAX_LOD_SIZE]); +} + +void VarConv2DCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* top = param.Out; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int batch = bottom->lod()[0].size() - 1; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + int top_im_y = 0; + if (width != 0) { + top_im_x = (width - 1) / stride_w + 1; + } + if (height != 0) { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + auto* bottom_data = bottom->data(); + auto* w_data = w->data(); + + int* offset_x_xpu = reinterpret_cast(offset_x_xpu_guard_->addr_); + int* offset_y_xpu = reinterpret_cast(offset_y_xpu_guard_->addr_); + for (int i = 0; i < (batch + 1); ++i) { + offset_x_cpu[i] = offset_x[i]; + offset_y_cpu[i] = offset_y[i]; + } + XPU_CALL(xpu_memcpy(offset_x_xpu, + offset_x_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_y_xpu, + offset_y_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + + int r = xdnn::search_varconv(ctx.GetRawContext(), + batch, + input_channel, + output_channel, + kernel_h, + kernel_w, + stride_h, + stride_w, + bottom_data, + w_data, + offset_x_xpu, + offset_y_xpu, + top_data, + w_max, + act); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git 
a/lite/kernels/xpu/var_conv_2d_compute.h b/lite/kernels/xpu/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..4d9f0ca7a9851a0c3071e72519c4ad1f40ea3483 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_x_xpu_guard_; + XPUScratchPadGuard offset_y_xpu_guard_; + std::unique_ptr offset_x_cpu; + std::unique_ptr offset_y_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/CMakeLists.txt b/lite/model_parser/CMakeLists.txt index 34d524c5c1b86fb6b689b86089c355e3de42a34e..a83cecf4444910e710d0eb92b9c3449190f5bda2 100644 --- a/lite/model_parser/CMakeLists.txt +++ b/lite/model_parser/CMakeLists.txt @@ -1,8 +1,9 @@ if (NOT LITE_ON_TINY_PUBLISH) add_subdirectory(pb) endif() -add_subdirectory(cpp) +add_subdirectory(general) add_subdirectory(naive_buffer) +add_subdirectory(flatbuffers) #lite_cc_library(runtime_lite SRCS runtime.cc) diff --git a/lite/model_parser/base/apis.h b/lite/model_parser/base/apis.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3449017c902479a7f6ad37ef73b3a316f585cc --- /dev/null +++ b/lite/model_parser/base/apis.h @@ -0,0 +1,23 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/base/proto_desc.h" +#include "lite/model_parser/base/traits.h" +#include "lite/model_parser/base/var_desc.h" +#include "lite/utils/all.h" diff --git a/lite/model_parser/base/block_desc.h b/lite/model_parser/base/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..b3d2e2452714d474e9d6bc9280cb2c5455fecc98 --- /dev/null +++ b/lite/model_parser/base/block_desc.h @@ -0,0 +1,86 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +class BlockDescReadAPI { + public: + virtual int32_t Idx() const = 0; + virtual int32_t ParentIdx() const = 0; + virtual size_t VarsSize() const = 0; + virtual size_t OpsSize() const = 0; + virtual int32_t ForwardBlockIdx() const = 0; + + template + T* GetVar(int32_t idx); + + template + T const* GetVar(int32_t idx) const; + + template + T* GetOp(int32_t idx); + + template + T const* GetOp(int32_t idx) const; + + virtual ~BlockDescReadAPI() = default; +}; + +class BlockDescWriteAPI { + public: + virtual void SetIdx(int32_t idx) { NotImplemented(); } + virtual void SetParentIdx(int32_t idx) { NotImplemented(); } + virtual void ClearVars() { NotImplemented(); } + virtual void ClearOps() { NotImplemented(); } + virtual void SetForwardBlockIdx(int32_t idx) { NotImplemented(); } + + template + T* AddVar() { + NotImplemented(); + return nullptr; + } + + template + T* AddOp() { + NotImplemented(); + return nullptr; + } + + virtual ~BlockDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "BlockDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class BlockDescAPI : public BlockDescReadAPI, public BlockDescWriteAPI { + public: + virtual ~BlockDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/op_desc.h b/lite/model_parser/base/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..534ff0feabd2234b4d7a72894383020a5f64d594 --- /dev/null +++ b/lite/model_parser/base/op_desc.h @@ -0,0 +1,99 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
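The comment above notes that model reading and writing are one-time and separate, so the write-side interface only exists to fail loudly when a read-only backend is in use. A toy illustration of that split, using invented stand-in classes rather than the real Lite descriptors:

// Read/write split in miniature: a read-only backend implements only the read
// side, and any write call falls through to the default "not available" handler.
#include <cstdint>
#include <cstdlib>
#include <iostream>

struct ReadAPI {
  virtual int32_t Idx() const = 0;
  virtual ~ReadAPI() = default;
};

struct WriteAPI {
  virtual void SetIdx(int32_t) { Abort(); }
  virtual ~WriteAPI() = default;

 private:
  void Abort() const {
    std::cerr << "write interface unavailable in read-only mode\n";
    std::abort();
  }
};

// Combined interface kept only so legacy callers keep compiling.
struct DescAPI : ReadAPI, WriteAPI {};

struct ReadOnlyDesc : DescAPI {
  explicit ReadOnlyDesc(int32_t idx) : idx_(idx) {}
  int32_t Idx() const override { return idx_; }
  // SetIdx() is intentionally not overridden: calling it aborts at runtime.

 private:
  int32_t idx_;
};

int main() {
  ReadOnlyDesc desc(7);
  std::cout << desc.Idx() << "\n";  // reading works; desc.SetIdx(1) would abort
  return 0;
}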
+ +#pragma once +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { + +class OpDescReadAPI { + public: + virtual std::string Type() const = 0; + virtual std::vector Input(const std::string& param) const = 0; + virtual std::vector InputArgumentNames() const = 0; + virtual std::vector Output(const std::string& param) const = 0; + virtual std::vector OutputArgumentNames() const = 0; + virtual bool HasAttr(const std::string& name) const = 0; + virtual OpAttrType GetAttrType(const std::string& name) const = 0; + virtual std::vector AttrNames() const = 0; + + template + T GetAttr(const std::string& name) const; + + std::string Repr() const { + STL::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } + + virtual ~OpDescReadAPI() = default; +}; + +class OpDescWriteAPI { + public: + virtual void SetType(const std::string& type) { NotImplemented(); } + virtual void SetInput(const std::string& param, + const std::vector& args) { + NotImplemented(); + } + virtual void SetOutput(const std::string& param, + const std::vector& args) { + NotImplemented(); + } + + template + void SetAttr(const std::string& name, const T& v) { + NotImplemented(); + } + + virtual ~OpDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "OpDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class OpDescAPI : public OpDescReadAPI, public OpDescWriteAPI { + public: + using AttrType = OpAttrType; + virtual ~OpDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/program_desc.h b/lite/model_parser/base/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..9ca128bd0aa8ba39752247074e8d57c0d23513f3 --- /dev/null +++ b/lite/model_parser/base/program_desc.h @@ -0,0 +1,67 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
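Repr() in the reading interface above assembles a one-line summary of an op from its type and its input/output argument lists. A toy reading-side descriptor, with hypothetical op and argument names, showing the kind of string it produces:

// Toy descriptor illustrating the Repr() output format. This is only a sketch;
// the real OpDescReadAPI has many more pure-virtual methods.
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

struct ToyOp {
  std::string type{"fc"};
  std::vector<std::pair<std::string, std::vector<std::string>>> inputs{
      {"Input", {"x"}}, {"W", {"w0"}}};
  std::vector<std::pair<std::string, std::vector<std::string>>> outputs{
      {"Out", {"y"}}};

  std::string Repr() const {
    std::ostringstream ss;
    ss << type << "(";
    for (const auto& in : inputs) {
      ss << in.first << ":";
      for (const auto& v : in.second) ss << v << " ";
    }
    ss << ") -> (";
    for (const auto& out : outputs) {
      ss << out.first << ":";
      for (const auto& v : out.second) ss << v << " ";
    }
    ss << ")";
    return ss.str();
  }
};

int main() {
  std::cout << ToyOp{}.Repr() << "\n";  // fc(Input:x W:w0 ) -> (Out:y )
  return 0;
}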
+ +#pragma once + +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +class ProgramDescReadAPI { + public: + virtual size_t BlocksSize() const = 0; + virtual bool HasVersion() const = 0; + virtual int64_t Version() const = 0; + + template + T* GetBlock(int32_t idx); + + template + T const* GetBlock(int32_t idx) const; + + virtual ~ProgramDescReadAPI() = default; +}; + +class ProgramDescWriteAPI { + public: + virtual void ClearBlocks() { NotImplemented(); } + virtual void SetVersion(int64_t version) { NotImplemented(); } + + template + T* AddBlock() { + NotImplemented(); + return nullptr; + } + + virtual ~ProgramDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) + << "ProgramDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class ProgramDescAPI : public ProgramDescReadAPI, public ProgramDescWriteAPI { + public: + virtual ~ProgramDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/proto_desc.h b/lite/model_parser/base/proto_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..0f62ef6e43883fd41c509795d1e4f695fdbb8910 --- /dev/null +++ b/lite/model_parser/base/proto_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { + +// The Index of first Block in Program. also called root block. +constexpr int kRootBlockIdx = 0; +// The Parent Index of root Block, this block does not exist. +constexpr int kNoneBlockIdx = -1; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/traits.h b/lite/model_parser/base/traits.h new file mode 100644 index 0000000000000000000000000000000000000000..bda293686c7996abb9b0fe36edcc84407ed3b541 --- /dev/null +++ b/lite/model_parser/base/traits.h @@ -0,0 +1,82 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace lite { + +// The AttrType is used to make the proto::AttrType portable. 
+enum class OpAttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, +}; + +struct Standard {}; +struct Flatbuffers {}; + +template +class VectorView; + +template +struct OpDataTypeTrait; + +#define ATTR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait { \ + typedef type__ ET; \ + typedef type__ RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; +#define ATTR_VECTOR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait, U> { \ + typedef type__ ET; \ + typedef VectorView RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; + +ATTR_TYPE_TRAIT_IMPL(BLOCK, int16_t); +ATTR_TYPE_TRAIT_IMPL(INT, int32_t); +ATTR_TYPE_TRAIT_IMPL(FLOAT, float); +ATTR_TYPE_TRAIT_IMPL(STRING, std::string); +ATTR_TYPE_TRAIT_IMPL(BOOLEAN, bool); +ATTR_TYPE_TRAIT_IMPL(LONG, int64_t); + +ATTR_VECTOR_TYPE_TRAIT_IMPL(INTS, int32_t); +ATTR_VECTOR_TYPE_TRAIT_IMPL(FLOATS, float); +ATTR_VECTOR_TYPE_TRAIT_IMPL(STRINGS, std::string); +ATTR_VECTOR_TYPE_TRAIT_IMPL(LONGS, int64_t); + +#undef ATTR_TYPE_TRAIT_IMPL +#undef ATTR_VECTOR_TYPE_TRAIT_IMPL + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/var_desc.h b/lite/model_parser/base/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..47596f8792a83677a036bcb3d937e67576204546 --- /dev/null +++ b/lite/model_parser/base/var_desc.h @@ -0,0 +1,90 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +enum class VarDataType { + // Pod Types + BOOL = 0, + INT16, + INT32, + INT64, + FP16, + FP32, + FP64, + // Tensor is used in C++. 
+ SIZE_T, + UINT8, + INT8, + + // Other types that may need additional descriptions + LOD_TENSOR, + SELECTED_ROWS, + FEED_MINIBATCH, + FETCH_LIST, + STEP_SCOPES, + LOD_RANK_TABLE, + LOD_TENSOR_ARRAY, + PLACE_LIST, + READER, + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW, + TUPLE +}; + +class VarDescReadAPI { + public: + virtual std::string Name() const = 0; + virtual VarDataType GetType() const = 0; + virtual bool Persistable() const = 0; + virtual std::vector GetShape() const = 0; + virtual ~VarDescReadAPI() = default; +}; + +class VarDescWriteAPI { + public: + virtual void SetName(std::string name) { NotImplemented(); } + virtual void SetType(VarDataType type) { NotImplemented(); } + virtual void SetPersistable(bool persistable) { NotImplemented(); } + virtual void SetShape(const std::vector& dims) { NotImplemented(); } + virtual ~VarDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "VarDescWriteAPI is not available in model read-only mode."; + } +}; + +// The reading and writing of the model are one-time and separate. +// This interface is a combination of reading and writing interfaces, +// which is used to support legacy interfaces. + +class VarDescAPI : public VarDescReadAPI, public VarDescWriteAPI { + public: + using VarDataType = lite::VarDataType; + using Type = lite::VarDataType; + virtual ~VarDescAPI() = default; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/vector_view.h b/lite/model_parser/base/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..e4149d9c5acae83472904a86c47659355972855e --- /dev/null +++ b/lite/model_parser/base/vector_view.h @@ -0,0 +1,99 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits { + typedef T element_type; +}; + +template +struct VectorTraits; + +template +struct VectorTraits { + typedef std::vector vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename vector_type::const_reference const_reference; + typedef const_reference subscript_return_type; +}; + +} // namespace vector_view + +// In the process of optimizing the performance of model loading, we found +// that it was necessary to reduce the copying and construction of STL +// containers. So use VectorView to simulate the operation of STL containers +// without copying, such as iteration and subscripting. +// +// Currently, VectorView is applicable to STL vector and Flatbuffers Vector. +// We used the template Traits to unify the behavior of the two, and provided +// an implicit conversion operator from VectorView to STL vector. 
Please use +// implicit conversion with caution because it will bring significant overhead. + +template +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + typename Traits::subscript_return_type operator[](size_t i) const { + return cvec_->operator[](i); + } + typename Traits::const_iterator begin() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->begin(); + } + typename Traits::const_iterator end() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->end(); + } + size_t size() const { + if (!cvec_) { + return 0; + } + return cvec_->size(); + } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(size()); + for (size_t i = 0; i < size(); ++i) { + tmp.push_back(cvec_->operator[](i)); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/compatibility.cc b/lite/model_parser/compatibility.cc index 67d7c9d69152d31d1381ea847ef859a08e4f82a7..dd43f7bd25277e34a2fd8b04aae6b705402a0436 100644 --- a/lite/model_parser/compatibility.cc +++ b/lite/model_parser/compatibility.cc @@ -20,10 +20,7 @@ #include "lite/model_parser/naive_buffer/program_desc.h" #include "lite/model_parser/naive_buffer/var_desc.h" #ifndef LITE_ON_TINY_PUBLISH -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #endif namespace paddle { diff --git a/lite/model_parser/compatibility.h b/lite/model_parser/compatibility.h index 9e421d709d1823852d6dac5cd0070b4330f56752..a47870cf9c4d8e1743f2eb749823e88f18b33900 100644 --- a/lite/model_parser/compatibility.h +++ b/lite/model_parser/compatibility.h @@ -17,7 +17,7 @@ #include #include #include "lite/api/paddle_place.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatibility_test.cc b/lite/model_parser/compatibility_test.cc index b3cb38f1c95649567b72d73b8938420537ec7b5b..957bcb25ea68b5555c9937de4e87dc8e9c4923b1 100644 --- a/lite/model_parser/compatibility_test.cc +++ b/lite/model_parser/compatibility_test.cc @@ -17,10 +17,7 @@ #include "lite/api/paddle_lite_factory_helper.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); diff --git a/lite/model_parser/compatible_pb.cc b/lite/model_parser/compatible_pb.cc index 3d66a5234994036397e445744499696909a8ab3e..8bfeb419e51b01ae008959ac5af3e9752834b1ab 100644 --- a/lite/model_parser/compatible_pb.cc +++ b/lite/model_parser/compatible_pb.cc @@ -234,7 +234,7 @@ void OpAttrsCppToAny(const cpp::OpDesc &cpp_desc, OpDescType *any_desc) { template <> \ void TransformBlockDescCppToAny(const cpp::T &cpp_desc, \ NT::T *any_desc) { \ - auto desc = cpp_desc; \ + const cpp::T &desc = cpp_desc; \ any_desc->SetIdx(desc.Idx()); \ any_desc->SetParentIdx(desc.ParentIdx()); \ 
any_desc->SetForwardBlockIdx(desc.ForwardBlockIdx()); \ @@ -277,7 +277,7 @@ void OpAttrsCppToAny(const cpp::OpDesc &cpp_desc, OpDescType *any_desc) { template <> \ void TransformProgramDescCppToAny(const cpp::T &cpp_desc, \ NT::T *any_desc) { \ - auto desc = cpp_desc; \ + auto &desc = cpp_desc; \ if (desc.HasVersion()) { \ any_desc->SetVersion(desc.Version()); \ } \ diff --git a/lite/model_parser/compatible_pb.h b/lite/model_parser/compatible_pb.h index 80fee49133130b09fbdd490ed86dce0af924aac1..c9889a5879160dd60ec64c4806df8af888db99c9 100644 --- a/lite/model_parser/compatible_pb.h +++ b/lite/model_parser/compatible_pb.h @@ -21,10 +21,7 @@ * lite::pb::XXDesc/lite::naive_buffer::XXDesc. */ -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatible_pb_test.cc b/lite/model_parser/compatible_pb_test.cc index 088b64bf2cd13ce0f443f962bd2cb5f709c4d4f2..d9a46e463209eb33e6f2cb53f4644056f88e7085 100644 --- a/lite/model_parser/compatible_pb_test.cc +++ b/lite/model_parser/compatible_pb_test.cc @@ -14,10 +14,7 @@ #include "lite/model_parser/compatible_pb.h" #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/naive_buffer/block_desc.h" #include "lite/model_parser/naive_buffer/op_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/cpp_desc.h b/lite/model_parser/cpp_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..477f90a28d7bf1e31dbc648b18af42381e0c93d6 --- /dev/null +++ b/lite/model_parser/cpp_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/general/block_desc.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/program_desc.h" +#include "lite/model_parser/general/var_desc.h" + +namespace paddle { +namespace lite { +namespace cpp = general; +} +} diff --git a/lite/model_parser/desc_apis.h b/lite/model_parser/desc_apis.h deleted file mode 100644 index 801d89e57b9a77ce04516cfdb67ce8917694188e..0000000000000000000000000000000000000000 --- a/lite/model_parser/desc_apis.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc - * classes should implement this. - */ - -class VarDescAPI { - public: - enum class Type { - // Pod Types - BOOL = 0, - INT16, - INT32, - INT64, - FP16, - FP32, - FP64, - // Tensor is used in C++. - SIZE_T, - UINT8, - INT8, - - // Other types that may need additional descriptions - LOD_TENSOR, - SELECTED_ROWS, - FEED_MINIBATCH, - FETCH_LIST, - STEP_SCOPES, - LOD_RANK_TABLE, - LOD_TENSOR_ARRAY, - PLACE_LIST, - READER, - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW, - TUPLE - }; - - using VarDataType = Type; - - virtual ~VarDescAPI() = default; - - // Get var's name - virtual std::string Name() const = 0; - // Set var's name - virtual void SetName(std::string name) = 0; - // Get var's type - virtual Type GetType() const = 0; - // Set var's type - virtual void SetType(Type type) = 0; - // Tell whether var is persistable or not - virtual bool Persistable() const = 0; - // Set var to be persistable or not - virtual void SetPersistable(bool persistable) = 0; - // Get var's shape - virtual std::vector GetShape() const = 0; - // Set var's shape - virtual void SetShape(const std::vector& dims) = 0; -}; - -/* - * NOTE Some interfaces are weried, we remain them unchanged to keep compatible - * with framework::OpDesc in Fluid framework. - */ -class OpDescAPI { - public: - // The AttrType is used to make the proto::AttrType portable. - enum class AttrType { - INT = 0, - FLOAT = 1, - STRING = 2, - INTS = 3, - FLOATS = 4, - STRINGS = 5, - BOOLEAN = 6, - BOOLEANS = 7, - BLOCK = 8, - LONG = 9, - BLOCKS = 10, - LONGS = 11, - UNK, - }; - - template - struct AttrTypeTrait; - - template - struct DataTypeTrait; - - virtual ~OpDescAPI() = default; - - /// Get operator's type. - virtual std::string Type() const = 0; - /// Set operator's type. - virtual void SetType(const std::string& type) = 0; - /// Get arguments given the parameter. - virtual std::vector Input(const std::string& param) const = 0; - /// Get parameters. - virtual std::vector InputArgumentNames() const = 0; - /// Get arguments given the parameter. - virtual std::vector Output(const std::string& param) const = 0; - /// Get parameters. - virtual std::vector OutputArgumentNames() const = 0; - /// Set a input given the parameter and arguments. - virtual void SetInput(const std::string& param, - const std::vector& args) = 0; - virtual void SetOutput(const std::string& param, - const std::vector& args) = 0; - /// Tell whether this desc has an attribute. - virtual bool HasAttr(const std::string& name) const = 0; - - /// Get the type of an attribute. - virtual AttrType GetAttrType(const std::string& name) const = 0; - - virtual std::vector AttrNames() const = 0; - - /// Set an attribute. - template - void SetAttr(const std::string& name, const T& v); - - /// Get an attribute. 
- template - T GetAttr(const std::string& name) const; - - std::string Repr() const { - STL::stringstream ss; - ss << Type(); - ss << "("; - for (auto& arg : InputArgumentNames()) { - ss << arg << ":"; - for (auto val : Input(arg)) { - ss << val << " "; - } - } - ss << ") -> ("; - for (auto& arg : OutputArgumentNames()) { - ss << arg << ":"; - for (auto val : Output(arg)) { - ss << val << " "; - } - } - ss << ")"; - return ss.str(); - } -}; - -#define TYPE_TRAIT_IMPL(T, type__) \ - template <> \ - struct OpDescAPI::AttrTypeTrait { \ - typedef type__ DT; \ - }; \ - template <> \ - struct OpDescAPI::DataTypeTrait { \ - static constexpr AttrType AT = OpDescAPI::AttrType::T; \ - static constexpr const char* ATN = #T; \ - }; - -TYPE_TRAIT_IMPL(INT, int32_t); -TYPE_TRAIT_IMPL(FLOAT, float); -TYPE_TRAIT_IMPL(STRING, std::string); -TYPE_TRAIT_IMPL(BOOLEAN, bool); -TYPE_TRAIT_IMPL(LONG, int64_t); -TYPE_TRAIT_IMPL(INTS, std::vector); -TYPE_TRAIT_IMPL(FLOATS, std::vector); -TYPE_TRAIT_IMPL(STRINGS, std::vector); -TYPE_TRAIT_IMPL(LONGS, std::vector); -#undef TYPE_TRAIT_IMPL - -class BlockDescAPI { - public: - virtual ~BlockDescAPI() = default; - - virtual int32_t Idx() const = 0; - - virtual void SetIdx(int32_t idx) = 0; - - virtual int32_t ParentIdx() const = 0; - - virtual void SetParentIdx(int32_t idx) = 0; - - virtual size_t VarsSize() const = 0; - - virtual void ClearVars() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetVar(int32_t idx); - - template - T* AddVar(); - - virtual size_t OpsSize() const = 0; - - virtual void ClearOps() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetOp(int32_t idx); - - template - T* AddOp(); - - virtual int32_t ForwardBlockIdx() const = 0; - - virtual void SetForwardBlockIdx(int32_t idx) = 0; -}; - -class ProgramDescAPI { - public: - virtual ~ProgramDescAPI() = default; - - virtual size_t BlocksSize() const = 0; - - virtual void ClearBlocks() = 0; - - // NOTE: This ugly method is used to compatible interfaces between cpp and - // pb/nb backends - // TODO(sangoly): refine this - template - T* GetBlock(int32_t idx); - - template - T* AddBlock(); - - virtual bool HasVersion() const = 0; - - virtual int64_t Version() const = 0; - - virtual void SetVersion(int64_t version) = 0; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/model_parser/flatbuffers/CMakeLists.txt b/lite/model_parser/flatbuffers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7ae9514efaa406d6b339c7917ad3dc2ad4a1f4f --- /dev/null +++ b/lite/model_parser/flatbuffers/CMakeLists.txt @@ -0,0 +1,13 @@ +function(lite_fbs_library TARGET) + set(multiValueArgs SRCS FBS_DEPS) + cmake_parse_arguments(args "" "" "${multiValueArgs}" ${ARGN}) + lite_cc_library(${TARGET} SRCS ${args_SRCS}) + add_dependencies(${TARGET} ${args_FBS_DEPS}) +endfunction() + +lite_fbs_library(fbs_op_desc SRCS op_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_var_desc SRCS var_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_block_desc SRCS block_desc.cc FBS_DEPS framework_fbs_header) +lite_cc_library(fbs_program_desc SRCS program_desc.cc DEPS fbs_op_desc fbs_var_desc fbs_block_desc) +lite_cc_library(fbs_io SRCS io.cc DEPS fbs_program_desc) +lite_cc_test(test_vector_view SRCS vector_view_test.cc DEPS fbs_program_desc) diff --git 
a/lite/model_parser/flatbuffers/block_desc.cc b/lite/model_parser/flatbuffers/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..64087bb0707a891cc94a2d1234bb582312c3c10a --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/block_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return desc_->vars()->Get(idx); +} + +template <> +proto::OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return desc_->ops()->Get(idx); +} + +template <> +VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return &vars_[idx]; +} + +template <> +OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return &ops_[idx]; +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/block_desc.h b/lite/model_parser/flatbuffers/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..dd99bdaa69020823ad6ca50438f21356eae41459 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.h @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
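fbs::BlockDesc exposes the same index through two GetVar/GetOp specializations: one returning the raw FlatBuffers pointer and one returning the wrapped view object. A toy version of that pattern with stand-in types (not the real proto::VarDesc / fbs::VarDesc):

// Dual template specializations over the same storage, for illustration.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct RawVar { std::string name; };

struct VarView {
  explicit VarView(const RawVar* raw) : raw_(raw) {}
  std::string Name() const { return raw_->name; }

 private:
  const RawVar* raw_;  // not owned
};

class ToyBlock {
 public:
  ToyBlock() : raw_{{"x"}, {"y"}} {
    for (const auto& r : raw_) views_.emplace_back(&r);
  }
  template <typename T>
  const T* GetVar(int idx) const;

 private:
  std::vector<RawVar> raw_;
  std::vector<VarView> views_;
};

template <>
const RawVar* ToyBlock::GetVar<RawVar>(int idx) const {
  assert(idx < static_cast<int>(raw_.size()));
  return &raw_[idx];
}

template <>
const VarView* ToyBlock::GetVar<VarView>(int idx) const {
  assert(idx < static_cast<int>(views_.size()));
  return &views_[idx];
}

int main() {
  ToyBlock block;
  std::cout << block.GetVar<RawVar>(0)->name << " "
            << block.GetVar<VarView>(1)->Name() << "\n";  // x y
  return 0;
}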
+ +#pragma once + +#include +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/op_desc.h" +#include "lite/model_parser/flatbuffers/var_desc.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class BlockDesc : public BlockDescAPI { + public: + explicit BlockDesc(proto::BlockDesc const* desc) : desc_(desc) { + CHECK(desc_); + vars_.reserve(VarsSize()); + ops_.reserve(OpsSize()); + for (size_t idx = 0; idx < VarsSize(); ++idx) { + vars_.push_back(VarDesc(desc_->vars()->Get(idx))); + } + for (size_t idx = 0; idx < OpsSize(); ++idx) { + ops_.push_back(OpDesc(desc_->ops()->Get(idx))); + } + } + + int32_t Idx() const override { return desc_->idx(); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + size_t VarsSize() const override { return desc_->vars()->size(); } + + template + T const* GetVar(int32_t idx) const; + + template + T* GetVar(int32_t idx) { + NotImplemented(); + return nullptr; + } + + size_t OpsSize() const override { + CHECK(desc_); + CHECK(desc_->ops()); + return desc_->ops()->size(); + } + + template + T const* GetOp(int32_t idx) const; + + template + T* GetOp(int32_t idx) { + NotImplemented(); + return nullptr; + } + + const std::vector& GetVars() const { return vars_; } + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + BlockDesc() { NotImplemented(); } + + private: + proto::BlockDesc const* desc_; // not_own + std::vector vars_; + std::vector ops_; + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of BlockDesc is temporarily " + "unavailable in read-only mode."; + } +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/framework.fbs b/lite/model_parser/flatbuffers/framework.fbs new file mode 100644 index 0000000000000000000000000000000000000000..90f6e626088003975f18303e47230a85c303181d --- /dev/null +++ b/lite/model_parser/flatbuffers/framework.fbs @@ -0,0 +1,172 @@ +// Generated from framework.proto + +namespace paddle.lite.fbs.proto; + +enum AttrType : int { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, +} + +namespace paddle.lite.fbs.proto.VarType_; + +enum Type : int { + BOOL = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + LOD_TENSOR = 7, + SELECTED_ROWS = 8, + FEED_MINIBATCH = 9, + FETCH_LIST = 10, + STEP_SCOPES = 11, + LOD_RANK_TABLE = 12, + LOD_TENSOR_ARRAY = 13, + PLACE_LIST = 14, + READER = 15, + RAW = 17, + TUPLE = 18, + SIZE_T = 19, + UINT8 = 20, + INT8 = 21, +} + +namespace paddle.lite.fbs.proto.CompatibleInfo_; + +enum Type : int { + COMPATIBLE = 0, + DEFINITELY_NOT = 1, + POSSIBLE = 2, + BUG_FIX = 3, + PRECISION_CHANGE = 4, +} + +namespace paddle.lite.fbs.proto; + +table Version { + version:long; +} + +table OpDesc { + type:string (required); + inputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + outputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + attrs:[paddle.lite.fbs.proto.OpDesc_.Attr]; + is_target:bool; +} + +namespace paddle.lite.fbs.proto.OpDesc_; + +table Attr { + name:string (required, key); + type:paddle.lite.fbs.proto.AttrType; + i:int; + f:float; + s:string; + ints:[int]; + floats:[float]; + strings:[string]; + b:bool; + bools:[bool]; + block_idx:int; + l:long; + blocks_idx:[int]; + longs:[long]; +} + +table Var 
{ + parameter:string (required, key); + arguments:[string]; +} + +namespace paddle.lite.fbs.proto; + +table VarType { + type:paddle.lite.fbs.proto.VarType_.Type; + selected_rows:paddle.lite.fbs.proto.VarType_.TensorDesc; + lod_tensor:paddle.lite.fbs.proto.VarType_.LoDTensorDesc; + tensor_array:paddle.lite.fbs.proto.VarType_.LoDTensorArrayDesc; + reader:paddle.lite.fbs.proto.VarType_.ReaderDesc; + tuple:paddle.lite.fbs.proto.VarType_.Tuple; +} + +namespace paddle.lite.fbs.proto.VarType_; + +table TensorDesc { + data_type:paddle.lite.fbs.proto.VarType_.Type; + dims:[long]; +} + +table LoDTensorDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table LoDTensorArrayDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table ReaderDesc { + lod_tensor:[paddle.lite.fbs.proto.VarType_.LoDTensorDesc]; +} + +table Tuple { + element_type:[paddle.lite.fbs.proto.VarType_.Type]; +} + +namespace paddle.lite.fbs.proto; + +table VarDesc { + name:string (required, key); + type:paddle.lite.fbs.proto.VarType (required); + persistable:bool; + need_check_feed:bool; +} + +table BlockDesc { + idx:int; + parent_idx:int; + vars:[paddle.lite.fbs.proto.VarDesc]; + ops:[paddle.lite.fbs.proto.OpDesc]; + forward_block_idx:int = -1; +} + +table CompatibleInfo { + version:string (required); + type:paddle.lite.fbs.proto.CompatibleInfo_.Type; +} + +table OpCompatibleMap { + pair:[paddle.lite.fbs.proto.OpCompatibleMap_.OpCompatiblePair]; + default_required_version:string; +} + +namespace paddle.lite.fbs.proto.OpCompatibleMap_; + +table OpCompatiblePair { + op_name:string (required, key); + compatible_info:paddle.lite.fbs.proto.CompatibleInfo (required); +} + +namespace paddle.lite.fbs.proto; + +table ProgramDesc { + blocks:[paddle.lite.fbs.proto.BlockDesc]; + version:paddle.lite.fbs.proto.Version; + op_compatible_map:paddle.lite.fbs.proto.OpCompatibleMap; +} + +root_type paddle.lite.fbs.proto.ProgramDesc; diff --git a/lite/model_parser/flatbuffers/io.cc b/lite/model_parser/flatbuffers/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef8e9afaefe94d72113299050f16077a09f6c6cf --- /dev/null +++ b/lite/model_parser/flatbuffers/io.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/io.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace fbs { + +void LoadModel(const std::string& path, ProgramDesc* prog) { + CHECK(prog); + FILE* file = fopen(path.c_str(), "rb"); + fseek(file, 0, SEEK_END); + int64_t length = ftell(file); + rewind(file); + std::vector buf(length); + CHECK(fread(buf.data(), 1, length, file)); + fclose(file); + prog->Init(std::move(buf)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/io.h b/lite/model_parser/flatbuffers/io.h new file mode 100644 index 0000000000000000000000000000000000000000..1c81b192bbbcfc026bc4a2e77225c9a4c68208f3 --- /dev/null +++ b/lite/model_parser/flatbuffers/io.h @@ -0,0 +1,28 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/model_parser/flatbuffers/program_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +void LoadModel(const std::string& path, ProgramDesc* prog); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.cc b/lite/model_parser/flatbuffers/op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e416b020d8fed0861d1d0b02ae74a9ccc47df59 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/op_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +std::string OpDesc::GetAttr(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +std::string OpDesc::GetAttr(size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +lite::VectorView +OpDesc::GetAttr>(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(it) << "Attr " << name << "does not exist."; + return VectorView(it->strings()); +} + +template <> +VectorView OpDesc::GetAttr>( + size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + CHECK(it) << "Attr " << idx << "does not exist."; + return VectorView(it->strings()); +} + +#define GET_ATTR_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return it->fb_f__(); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return it->fb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } + +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.h b/lite/model_parser/flatbuffers/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..450aa49fa13b676b33bef8490c65061dc504431d --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.h @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/vector_view.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class OpDesc : public OpDescAPI { + public: + explicit OpDesc(proto::OpDesc const* desc) : desc_(desc) { CHECK(desc_); } + + std::string Type() const override { return desc_->type()->str(); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string& param) const override { + const auto& var = desc_->inputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& in : *var->arguments()) { + args_vec.push_back(in->str()); + } + } + return args_vec; + } + + std::vector InputArgumentNames() const override { + const auto& vars = desc_->inputs(); + std::vector input_names_vec; + if (vars) { + input_names_vec.reserve(vars->size()); + for (const auto& in : *vars) { + input_names_vec.push_back(in->parameter()->str()); + } + } + return input_names_vec; + } + + std::vector Output(const std::string& param) const override { + const auto& var = desc_->outputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var && var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& out : *var->arguments()) { + args_vec.push_back(out->str()); + } + } + return args_vec; + } + + std::vector OutputArgumentNames() const override { + const auto& vars = desc_->outputs(); + std::vector output_names_vec; + if (vars) { + output_names_vec.reserve(vars->size()); + for (const auto& out : *vars) { + output_names_vec.push_back(out->parameter()->str()); + } + } + return output_names_vec; + } + + bool HasAttr(const std::string& name) const override { + return desc_->attrs()->LookupByKey(name.c_str()) != nullptr; + } + + size_t AttrsSize() const { return desc_->attrs()->size(); } + + std::string AttrName(size_t idx) const { + return desc_->attrs()->Get(idx)->name()->str(); + } + + OpDescAPI::AttrType GetAttrType(const std::string& name) const override { + const auto& attr = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(attr) << "Can not find attr: " << name; + return static_cast(attr->type()); + } + + OpDescAPI::AttrType GetAttrType(size_t idx) const { + const auto& attr = desc_->attrs()->Get(idx); + CHECK(attr); + return static_cast(attr->type()); + } + + std::vector AttrNames() const override { + const auto& attrs = desc_->attrs(); + std::vector attr_names_vec; + if (attrs) { + attr_names_vec.reserve(attrs->size()); + for (const auto& attr : *attrs) { + attr_names_vec.push_back(attr->name()->str()); + } + } + return attr_names_vec; + } + + template + typename lite::OpDataTypeTrait::RT GetAttr( + const std::string& name) const; + + template + typename lite::OpDataTypeTrait::RT GetAttr(size_t idx) const; + + private: + proto::OpDesc const* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. 
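The comment above explains the plan to make cpp::Desc and the FlatBuffers descriptors interchangeable through namespace aliasing rather than inheritance. A toy illustration of that mechanism with invented namespaces:

// Two structurally compatible descriptor types live in different namespaces;
// a single alias decides which one the rest of the code sees.
#include <iostream>
#include <string>

namespace general_impl {
struct OpDesc {
  std::string Type() const { return "general"; }
};
}  // namespace general_impl

namespace fbs_impl {
struct OpDesc {
  std::string Type() const { return "flatbuffers"; }
};
}  // namespace fbs_impl

// Flip this alias (e.g. behind a build option) to swap the backing
// implementation without touching call sites.
namespace active = general_impl;

int main() {
  active::OpDesc op;
  std::cout << op.Type() << "\n";  // general
  return 0;
}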
+
+ public:
+  OpDesc() { NotImplemented(); }
+  bool HasInput(const std::string& param) const {
+    return desc_->inputs()->LookupByKey(param.c_str()) != nullptr;
+  }
+
+  const std::map<std::string, std::vector<std::string>>& inputs() const {
+    NotImplemented();
+    return inputs_;
+  }
+  const std::map<std::string, std::vector<std::string>>& outputs() const {
+    NotImplemented();
+    return outputs_;
+  }
+  std::map<std::string, std::vector<std::string>>* mutable_inputs() {
+    NotImplemented();
+    return &inputs_;
+  }
+  std::map<std::string, std::vector<std::string>>* mutable_outputs() {
+    NotImplemented();
+    return &outputs_;
+  }
+
+  std::vector<std::string> input_vars() const {
+    NotImplemented();
+    return std::vector<std::string>();
+  }
+
+  std::vector<std::string> output_vars() const {
+    NotImplemented();
+    return std::vector<std::string>();
+  }
+
+  bool HasOutput(const std::string& param) const {
+    return !Output(param).empty();
+  }
+
+  const std::map<std::string, Any>& attrs() const {
+    NotImplemented();
+    return attrs_;
+  }
+  const std::map<std::string, AttrType>& attr_types() const {
+    NotImplemented();
+    return attr_types_;
+  }
+
+ private:
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of OpDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+  std::string type_;
+  std::map<std::string, std::vector<std::string>> inputs_;
+  std::map<std::string, std::vector<std::string>> outputs_;
+  std::map<std::string, Any> attrs_;
+  std::map<std::string, AttrType> attr_types_;
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/program_desc.cc b/lite/model_parser/flatbuffers/program_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f04954a9dc890a0b5866a7e6c3f3c7b18f2783e4
--- /dev/null
+++ b/lite/model_parser/flatbuffers/program_desc.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/model_parser/flatbuffers/program_desc.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+template <>
+proto::BlockDesc const* ProgramDesc::GetBlock<proto::BlockDesc>(
+    int32_t idx) const {
+  CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()";
+  return desc_->blocks()->Get(idx);
+}
+
+template <>
+BlockDesc const* ProgramDesc::GetBlock<BlockDesc>(int32_t idx) const {
+  CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()";
+  return &blocks_[idx];
+}
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/program_desc.h b/lite/model_parser/flatbuffers/program_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..55218eef5b4037d13b2f45db6de6b94cb39d994e
--- /dev/null
+++ b/lite/model_parser/flatbuffers/program_desc.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+#include "lite/model_parser/base/program_desc.h"
+#include "lite/model_parser/flatbuffers/block_desc.h"
+#include "lite/model_parser/flatbuffers/framework_generated.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+class ProgramDesc : public ProgramDescAPI {
+ public:
+  ProgramDesc() = default;
+  explicit ProgramDesc(const std::vector<char>& buf) { Init(buf); }
+  explicit ProgramDesc(std::vector<char>&& buf) {
+    Init(std::forward<std::vector<char>>(buf));
+  }
+
+  void Init(const std::vector<char>& buf) {
+    CHECK(buf.data());
+    buf_ = buf;
+    InitProgramDesc();
+  }
+
+  void Init(std::vector<char>&& buf) {
+    CHECK(buf.data());
+    buf_ = std::move(buf);
+    InitProgramDesc();
+  }
+
+  void InitProgramDesc() {
+    desc_ = proto::GetProgramDesc(buf_.data());
+    blocks_.reserve(BlocksSize());
+    for (size_t idx = 0; idx < BlocksSize(); ++idx) {
+      blocks_.push_back(BlockDesc(desc_->blocks()->Get(idx)));
+    }
+  }
+
+  void CopyFrom(const ProgramDesc& other) {
+    buf_ = other.buf();
+    Init(buf_);
+  }
+
+  size_t BlocksSize() const override { return desc_->blocks()->size(); }
+
+  template <typename T>
+  T const* GetBlock(int32_t idx) const;
+
+  template <typename T>
+  T* GetBlock(int32_t idx) {
+    NotImplemented();
+    return nullptr;
+  }
+
+  const std::vector<BlockDesc>& GetBlocks() const { return blocks_; }
+
+  bool HasVersion() const override { return desc_->version() != nullptr; }
+
+  int64_t Version() const override {
+    CHECK(HasVersion());
+    return desc_->version()->version();
+  }
+
+  proto::ProgramDesc const* raw_desc() const { return desc_; }
+
+  const std::vector<char>& buf() const { return buf_; }
+
+ private:
+  proto::ProgramDesc const* desc_;
+  std::vector<char> buf_;
+  std::vector<BlockDesc> blocks_;
+
+ private:
+  ProgramDesc& operator=(const ProgramDesc&) = delete;
+  ProgramDesc(const ProgramDesc&) = delete;
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of ProgramDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/var_desc.cc b/lite/model_parser/flatbuffers/var_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a629ffd5e35223aee218a8798a597b8c684c8c62
--- /dev/null
+++ b/lite/model_parser/flatbuffers/var_desc.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/model_parser/flatbuffers/var_desc.h"
diff --git a/lite/model_parser/flatbuffers/var_desc.h b/lite/model_parser/flatbuffers/var_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..48d81df30f78ca668bbe9358b4f488fd2f4d3d66
--- /dev/null
+++ b/lite/model_parser/flatbuffers/var_desc.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/model_parser/base/var_desc.h"
+#include "lite/model_parser/flatbuffers/framework_generated.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace fbs {
+
+class VarDesc : public VarDescAPI {
+ public:
+  explicit VarDesc(proto::VarDesc const* desc) : desc_(desc) {}
+
+  std::string Name() const override { return desc_->name()->str(); }
+
+  VarDescAPI::Type GetType() const override {
+    return static_cast<VarDescAPI::Type>(desc_->type()->type());
+  }
+
+  bool Persistable() const override { return desc_->persistable(); }
+
+  std::vector<int64_t> GetShape() const override {
+    CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR);
+    const auto& dims = desc_->type()->lod_tensor()->tensor()->dims();
+    std::vector<int64_t> dims_vec;
+    dims_vec.reserve(dims->size());
+    for (const auto& dim : *dims) {
+      dims_vec.push_back(dim);
+    }
+    return dims_vec;
+  }
+
+  VarDescAPI::Type GetDataType() const {
+    CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR);
+    return static_cast<VarDescAPI::Type>(
+        desc_->type()->lod_tensor()->tensor()->data_type());
+  }
+
+ private:
+  proto::VarDesc const* desc_;
+
+  // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc
+  // and flatbuffers::Desc replace each other. However, there is no direct
+  // inheritance relationship between the two data types, and the read-only
+  // version of flatbuffers lacks some write implementations. Therefore, at
+  // present, we are temporarily providing a default interface that triggers
+  // execution-time errors to avoid type ambiguity and compile-time errors
+  // caused by different building options.
+
+ public:
+  VarDesc() { NotImplemented(); }
+  void SetDataType(Type data_type) { NotImplemented(); }
+  void SetShape(const std::vector<int64_t>& dims) { NotImplemented(); }
+
+ private:
+  void NotImplemented() const {
+    LOG(FATAL) << "The additional interfaces of VarDesc are temporarily "
+                  "unavailable in read-only mode.";
+  }
+  std::vector<int64_t> shape_;
+};
+
+}  // namespace fbs
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/vector_view.h b/lite/model_parser/flatbuffers/vector_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb1331823a2dce79d2b3a6784f1f2d5b5864281d
--- /dev/null
+++ b/lite/model_parser/flatbuffers/vector_view.h
@@ -0,0 +1,143 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "flatbuffers/flatbuffers.h"
+#include "lite/model_parser/base/vector_view.h"
+
+namespace paddle {
+namespace lite {
+namespace vector_view {
+
+template <typename T>
+struct ElementTraits<T*,
+                     typename std::enable_if<std::is_class<T>::value>::type> {
+  typedef flatbuffers::Offset<T> element_type;
+};
+
+template <>
+struct ElementTraits<std::string, void> {
+  typedef flatbuffers::Offset<flatbuffers::String> element_type;
+};
+
+template <typename T>
+struct VectorTraits<T, Flatbuffers> {
+  typedef flatbuffers::Vector<typename ElementTraits<T>::element_type>
+      vector_type;
+  typedef typename vector_type::const_iterator const_iterator;
+  typedef typename const_iterator::value_type value_type;
+  typedef const typename const_iterator::reference const_reference;
+  typedef value_type subscript_return_type;
+};
+
+struct FBSStrIterator {
+  typedef flatbuffers::VectorIterator<
+      flatbuffers::Offset<flatbuffers::String>,
+      typename flatbuffers::IndirectHelper<
+          flatbuffers::Offset<flatbuffers::String>>::return_type>
+      VI;
+
+  FBSStrIterator() = default;
+  explicit FBSStrIterator(const VI& iter) { iter_ = iter; }
+  const VI& raw_iter() const { return iter_; }
+
+  bool operator==(const FBSStrIterator& other) const {
+    return iter_ == other.raw_iter();
+  }
+
+  bool operator<(const FBSStrIterator& other) const {
+    return iter_ < other.raw_iter();
+  }
+
+  bool operator!=(const FBSStrIterator& other) const {
+    return iter_ != other.raw_iter();
+  }
+
+  ptrdiff_t operator-(const FBSStrIterator& other) const {
+    return iter_ - other.raw_iter();
+  }
+
+  std::string operator*() const { return iter_.operator*()->str(); }
+  std::string operator->() const { return iter_.operator->()->str(); }
+
+  FBSStrIterator& operator++() {
+    iter_++;
+    return *this;
+  }
+
+  FBSStrIterator& operator--() {
+    iter_--;
+    return *this;
+  }
+
+  FBSStrIterator operator+(const size_t& offset) {
+    return FBSStrIterator(iter_ + offset);
+  }
+
+  FBSStrIterator operator-(const size_t& offset) {
+    return FBSStrIterator(iter_ - offset);
+  }
+
+ private:
+  VI iter_;
+};
+
+}  // namespace vector_view
+
+template <>
+class VectorView<std::string, Flatbuffers> {
+ public:
+  typedef vector_view::VectorTraits<std::string, Flatbuffers> Traits;
+  explicit VectorView(typename Traits::vector_type const* cvec) {
+    cvec_ = cvec;
+  }
+  std::string operator[](size_t i) const {
+    return cvec_->operator[](i)->str();
+  }
+  vector_view::FBSStrIterator begin() const {
+    if (!cvec_) {
+      return vector_view::FBSStrIterator();
+    }
+    return vector_view::FBSStrIterator(cvec_->begin());
+  }
+  vector_view::FBSStrIterator end() const {
+    if (!cvec_) {
+      return vector_view::FBSStrIterator();
+    }
+    return vector_view::FBSStrIterator(cvec_->end());
+  }
+  size_t size() const {
+    if (!cvec_) {
+      return 0;
+    }
+    return cvec_->size();
+  }
+  operator std::vector<std::string>() const {
+    VLOG(5) << "Copying elements out of VectorView will damage performance.";
+    std::vector<std::string> tmp;
+    tmp.reserve(size());
+    for (size_t i = 0; i < size(); ++i) {
+      tmp.push_back(cvec_->operator[](i)->str());
+    }
+    return tmp;
+  }
+  ~VectorView() = default;
+
+ private:
+  typename Traits::vector_type const* cvec_;
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/flatbuffers/vector_view_test.cc b/lite/model_parser/flatbuffers/vector_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6512ee69bd4f34c0d6e49274d478404191fd9476
--- /dev/null
+++ b/lite/model_parser/flatbuffers/vector_view_test.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/vector_view.h" +#include +#include +#include +#include +#include "lite/model_parser/flatbuffers/framework_generated.h" + +namespace paddle { +namespace lite { + +TEST(VectorView, std_vector) { + std::vector vector{1, 2, 3}; + VectorView vector_view(&vector); + size_t i = 0; + for (const auto& value : vector_view) { + EXPECT_EQ(value, vector[i]); + ++i; + } + for (size_t j = 0; j < vector_view.size(); ++j) { + EXPECT_EQ(vector_view[i], vector[i]); + } +} + +TEST(VectorView, Flatbuffers) { + using namespace flatbuffers; // NOLINT + using namespace paddle::lite::fbs; // NOLINT + + auto create_desc = [](FlatBufferBuilder& fbb) { + /* --------- Set --------- */ + // Attr + std::vector ints({-1, 0, 1, 2, 3}); + auto string_0 = fbb.CreateString("string_0"); + auto string_1 = fbb.CreateString("string_1"); + std::vector> strings; + strings.push_back(string_0); + strings.push_back(string_1); + auto attr = proto::OpDesc_::CreateAttrDirect(fbb, + nullptr, + proto::AttrType_INT, + 0, + 0.0f, + nullptr, + &ints, + nullptr, + &strings); + + // OpDesc + std::vector> attrs; + attrs.push_back(attr); + auto op_desc = + proto::CreateOpDescDirect(fbb, "hello!", nullptr, nullptr, &attrs); + + // BlockDesc 0 + std::vector> ops; + ops.push_back(op_desc); + auto block_0 = proto::CreateBlockDescDirect(fbb, 0, 0, nullptr, &ops); + + // BlockDesc 1 + auto block_1 = proto::CreateBlockDescDirect(fbb, 1); + + // ProgramDesc + std::vector> block_vector; + block_vector.push_back(block_0); + block_vector.push_back(block_1); + auto orc = proto::CreateProgramDescDirect(fbb, &block_vector); + fbb.Finish(orc); + }; + + FlatBufferBuilder fbb; + create_desc(fbb); + auto program = fbs::proto::GetProgramDesc(fbb.GetBufferPointer()); + + // BlockDesc View + VectorView block_view(program->blocks()); + EXPECT_EQ(block_view.size(), static_cast(2)); + EXPECT_EQ(block_view[0]->idx(), 0); + EXPECT_EQ(block_view[1]->idx(), 1); + + // OpDesc & Attr View + VectorView op_view(block_view[0]->ops()); + EXPECT_EQ(op_view[0]->type()->str(), std::string("hello!")); + VectorView attr_view(op_view[0]->attrs()); + + // int32_t View + VectorView ints_view(attr_view[0]->ints()); + std::vector ints({-1, 0, 1, 2, 3}); + size_t cnt_0 = 0; + for (const auto& i : ints_view) { + EXPECT_EQ(i, ints[cnt_0]); + ++cnt_0; + } + for (size_t i = 0; i < ints_view.size(); ++i) { + EXPECT_EQ(ints_view[i], ints[i]); + } + std::vector ints_2(ints_view); + for (size_t i = 0; i < ints_2.size(); ++i) { + EXPECT_EQ(ints_2[i], ints[i]); + } + + // String View + VectorView strings_view(attr_view[0]->strings()); + std::vector strings({"string_0", "string_1"}); + EXPECT_EQ(strings_view.size(), strings.size()); + size_t cnt_1 = 0; + for (const auto& s : strings_view) { + EXPECT_EQ(s, strings[cnt_1]); + ++cnt_1; + } + for (size_t i = 0; i < strings_view.size(); ++i) { + EXPECT_EQ(strings_view[i], strings[i]); + } + std::vector string_2(strings_view); + for (size_t i = 0; i < string_2.size(); ++i) { + EXPECT_EQ(string_2[i], strings[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git 
a/lite/model_parser/cpp/CMakeLists.txt b/lite/model_parser/general/CMakeLists.txt similarity index 70% rename from lite/model_parser/cpp/CMakeLists.txt rename to lite/model_parser/general/CMakeLists.txt index fe3b2f848e404385b8d948db676865b8039f4ba2..ed53678dfac4cc58b208c2faa8573bcd06943aaa 100644 --- a/lite/model_parser/cpp/CMakeLists.txt +++ b/lite/model_parser/general/CMakeLists.txt @@ -3,4 +3,4 @@ lite_cc_library(cpp_var_desc SRCS var_desc.cc) lite_cc_library(cpp_block_desc SRCS block_desc.cc) lite_cc_library(cpp_program_desc SRCS program_desc.cc) -set(cpp_wrapper cpp_op_desc cpp_var_desc cpp_block_desc cpp_program_desc PARENT_SCOPE) +set(cpp_wrapper cpp_program_desc cpp_block_desc cpp_var_desc cpp_op_desc PARENT_SCOPE) diff --git a/lite/model_parser/cpp/block_desc.cc b/lite/model_parser/general/block_desc.cc similarity index 75% rename from lite/model_parser/cpp/block_desc.cc rename to lite/model_parser/general/block_desc.cc index a4dc7cd72acacb6392cecdfe9a551773c1937888..11d2376bc05a6086036b0fd026666b0b16b2de84 100644 --- a/lite/model_parser/cpp/block_desc.cc +++ b/lite/model_parser/general/block_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/block_desc.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> VarDesc* BlockDesc::GetVar(int32_t idx) { @@ -24,6 +24,12 @@ VarDesc* BlockDesc::GetVar(int32_t idx) { return &vars_[idx]; } +template <> +VarDesc const* BlockDesc::GetVar(int32_t idx) const { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return &vars_[idx]; +} + template <> VarDesc* BlockDesc::AddVar() { vars_.emplace_back(); @@ -36,12 +42,18 @@ OpDesc* BlockDesc::GetOp(int32_t idx) { return &ops_[idx]; } +template <> +OpDesc const* BlockDesc::GetOp(int32_t idx) const { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return &ops_[idx]; +} + template <> OpDesc* BlockDesc::AddOp() { ops_.emplace_back(); return &ops_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/block_desc.h b/lite/model_parser/general/block_desc.h similarity index 80% rename from lite/model_parser/cpp/block_desc.h rename to lite/model_parser/general/block_desc.h index b6f473b88b84bff71650dd4ecf4d1dc803351212..e618e570c20bfb0915289d2da625865fc5b64676 100644 --- a/lite/model_parser/cpp/block_desc.h +++ b/lite/model_parser/general/block_desc.h @@ -14,16 +14,17 @@ #pragma once #include -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/var_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::BlockDesc is the internal representation for Op. All the internal + * The general::BlockDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::BlockDesc. 
*/ class BlockDesc : public BlockDescAPI { @@ -45,6 +46,11 @@ class BlockDesc : public BlockDescAPI { template T* GetVar(int32_t idx); + template + T const* GetVar(int32_t idx) const; + + std::vector& GetVars() { return vars_; } + template T* AddVar(); @@ -55,6 +61,9 @@ class BlockDesc : public BlockDescAPI { template T* GetOp(int32_t idx); + template + T const* GetOp(int32_t idx) const; + template T* AddOp(); @@ -70,6 +79,6 @@ class BlockDesc : public BlockDescAPI { int32_t forward_block_idx_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.cc b/lite/model_parser/general/op_desc.cc similarity index 95% rename from lite/model_parser/cpp/op_desc.cc rename to lite/model_parser/general/op_desc.cc index a816943bb9689483f1eb60575147a42594db2654..b4589a14f26b641a0e48c69ec067cd847649b67e 100644 --- a/lite/model_parser/cpp/op_desc.cc +++ b/lite/model_parser/general/op_desc.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/general/op_desc.h" #include #include namespace paddle { namespace lite { -namespace cpp { +namespace general { std::vector OpDesc::OutputArgumentNames() const { std::vector res; @@ -69,6 +69,6 @@ bool OpDesc::HasOutput(const std::string& param) const { return it != outputs_.end(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.h b/lite/model_parser/general/op_desc.h similarity index 91% rename from lite/model_parser/cpp/op_desc.h rename to lite/model_parser/general/op_desc.h index 57d2f6bbb27a73e1093b6cef114d032e164c0432..e0c2541182adde6ab9171a55d859a5bd5a1195e2 100644 --- a/lite/model_parser/cpp/op_desc.h +++ b/lite/model_parser/general/op_desc.h @@ -17,16 +17,16 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/any.h" #include "lite/utils/varient.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::OpDesc is the internal representation for Op. All the internal + * The general::OpDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::OpDesc. 
*/ class OpDesc : public OpDescAPI { @@ -108,7 +108,7 @@ class OpDesc : public OpDescAPI { template void SetAttr(const std::string& name, const T& v) { - attr_types_[name] = OpDescAPI::DataTypeTrait::AT; + attr_types_[name] = OpDataTypeTrait::AT; attrs_[name].set(v); } @@ -119,8 +119,8 @@ class OpDesc : public OpDescAPI { auto attr_it = attr_types().find(name); CHECK(attr_it != attr_types().end()); auto pair = std::make_pair(it, attr_it); - CHECK(pair.second->second == OpDescAPI::DataTypeTrait::AT) - << "required type is " << OpDescAPI::DataTypeTrait::ATN + CHECK(pair.second->second == OpDataTypeTrait::AT) + << "required type is " << OpDataTypeTrait::ATN << " not match the true type"; return pair.first->second.get(); } @@ -131,6 +131,6 @@ class OpDesc : public OpDescAPI { } }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.cc b/lite/model_parser/general/program_desc.cc similarity index 78% rename from lite/model_parser/cpp/program_desc.cc rename to lite/model_parser/general/program_desc.cc index 3c6adcddf319db57366e5b3cdb05bc6169f229ee..b767a6f77ca657e8ec02b8e182dd8a8b62b7d6ab 100644 --- a/lite/model_parser/cpp/program_desc.cc +++ b/lite/model_parser/general/program_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/general/program_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> BlockDesc* ProgramDesc::GetBlock(int32_t idx) { @@ -24,12 +24,18 @@ BlockDesc* ProgramDesc::GetBlock(int32_t idx) { return &blocks_[idx]; } +template <> +BlockDesc const* ProgramDesc::GetBlock(int32_t idx) const { + CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()"; + return &blocks_[idx]; +} + template <> BlockDesc* ProgramDesc::AddBlock() { blocks_.emplace_back(); return &blocks_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.h b/lite/model_parser/general/program_desc.h similarity index 72% rename from lite/model_parser/cpp/program_desc.h rename to lite/model_parser/general/program_desc.h index 786dad134adf8d5ac4b03ba43b254359dfc2cdb2..bbc045412d2086473375863575e5d16146d84751 100644 --- a/lite/model_parser/cpp/program_desc.h +++ b/lite/model_parser/general/program_desc.h @@ -14,21 +14,29 @@ #pragma once #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::ProgramDesc is the internal representation for Op. All the internal + * The general::ProgramDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::ProgramDesc. 
*/ class ProgramDesc : public ProgramDescAPI { public: ProgramDesc() = default; + void CopyFrom(const ProgramDesc& other) { + version_ = other.Version(); + blocks_ = other.blocks(); + } + + const std::vector& blocks() const { return blocks_; } + size_t BlocksSize() const override { return blocks_.size(); } void ClearBlocks() override { blocks_.clear(); } @@ -36,6 +44,11 @@ class ProgramDesc : public ProgramDescAPI { template T* GetBlock(int32_t idx); + template + T const* GetBlock(int32_t idx) const; + + std::vector& GetBlocks() { return blocks_; } + template T* AddBlock(); @@ -52,6 +65,6 @@ class ProgramDesc : public ProgramDescAPI { std::vector blocks_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/var_desc.cc b/lite/model_parser/general/var_desc.cc similarity index 92% rename from lite/model_parser/cpp/var_desc.cc rename to lite/model_parser/general/var_desc.cc index e30bb3eb55d274d5287702d6247b94d5d33c4e74..f2782d1778b07ef201401a62f9c7a6295159ef5f 100644 --- a/lite/model_parser/cpp/var_desc.cc +++ b/lite/model_parser/general/var_desc.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/general/var_desc.h" diff --git a/lite/model_parser/cpp/var_desc.h b/lite/model_parser/general/var_desc.h similarity index 91% rename from lite/model_parser/cpp/var_desc.h rename to lite/model_parser/general/var_desc.h index c56d7cce53180e0157913372f8b0da4c9cedd8c9..ed69d035dfbe837afa79a3f52bd2c0c925bd19ea 100644 --- a/lite/model_parser/cpp/var_desc.h +++ b/lite/model_parser/general/var_desc.h @@ -15,14 +15,14 @@ #pragma once #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::VarDesc is the internal representation for Op. All the internal + * The general::VarDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::VarDesc. 
*/ class VarDesc : public VarDescAPI { @@ -59,6 +59,6 @@ class VarDesc : public VarDescAPI { std::vector shape_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index ea94ca52e8f123da5077f3b751ab03b857e8c390..cf93e7f2cedc8db5c5a18d26fa2499dd79c456de 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -21,7 +21,7 @@ #include "lite/core/tensor.h" #include "lite/core/variable.h" #include "lite/core/version.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/combined_params_desc.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" @@ -176,7 +176,7 @@ void LoadCombinedParamsPb(const std::string &path, const cpp::ProgramDesc &cpp_prog, bool params_from_memory) { CHECK(scope); - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // Get vars @@ -310,7 +310,7 @@ void SaveModelPb(const std::string &model_dir, void SaveCombinedParamsPb(const std::string &path, const lite::Scope &exec_scope, const cpp::ProgramDesc &cpp_prog) { - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // Get vars @@ -526,7 +526,7 @@ void SaveCombinedParamsNaive(const std::string &path, naive_buffer::proto::CombinedParamsDesc pt_desc(&table); naive_buffer::CombinedParamsDesc desc(&pt_desc); - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); // set unique_var_names to avoid saving shared params repeatedly std::set unique_var_names; @@ -681,7 +681,7 @@ void LoadCombinedParamsNaive(const std::string &path, } // Check all params loaded - auto prog = cpp_prog; + auto &prog = cpp_prog; auto &main_block_desc = *prog.GetBlock(0); for (size_t i = 0; i < main_block_desc.VarsSize(); ++i) { auto &var = *main_block_desc.GetVar(i); diff --git a/lite/model_parser/naive_buffer/block_desc.h b/lite/model_parser/naive_buffer/block_desc.h index b0ebe7c03f954654864fb9c56d6861cde7fe9384..3f99302c4033f3f732e0c79017fc251c6d0c40b5 100644 --- a/lite/model_parser/naive_buffer/block_desc.h +++ b/lite/model_parser/naive_buffer/block_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/combined_params_desc.h b/lite/model_parser/naive_buffer/combined_params_desc.h index a5462ef5eea47867a737cd1eff344c696f9dc159..1131bab9615b53055d58ba962ad21e206ee70bfc 100644 --- a/lite/model_parser/naive_buffer/combined_params_desc.h +++ b/lite/model_parser/naive_buffer/combined_params_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/naive_buffer/op_desc.h b/lite/model_parser/naive_buffer/op_desc.h index cce0c22c2e717b6d622314f31af2dc418503c78b..f4cd2d8578cf69854fc4044b739fdfa3d6516d50 100644 --- a/lite/model_parser/naive_buffer/op_desc.h +++ b/lite/model_parser/naive_buffer/op_desc.h @@ -23,7 +23,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include 
"lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/param_desc.h b/lite/model_parser/naive_buffer/param_desc.h index 0a20b153312d99602ada77317e64c5934df0f070..ebbbdaf846a3550015ec97c11ccfb7d34271b6c5 100644 --- a/lite/model_parser/naive_buffer/param_desc.h +++ b/lite/model_parser/naive_buffer/param_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/program_desc.h b/lite/model_parser/naive_buffer/program_desc.h index 0d59b7f71f4f32d4e861b6a622cab646797bca80..6f5277ad32aa2fccf52134a262975cfdbe1b9d6c 100644 --- a/lite/model_parser/naive_buffer/program_desc.h +++ b/lite/model_parser/naive_buffer/program_desc.h @@ -15,7 +15,7 @@ #pragma once #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/var_desc.h b/lite/model_parser/naive_buffer/var_desc.h index bf0845d7464f511dfb77812612c2b99c954600da..20c8e03a5433ba98c8dc3d98af25920a934ee31d 100644 --- a/lite/model_parser/naive_buffer/var_desc.h +++ b/lite/model_parser/naive_buffer/var_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/pb/block_desc.h b/lite/model_parser/pb/block_desc.h index d541a7fbd2dee2dbabf4acdd51259898691f9188..8844173798dcacf77c876f717b71c87cbc57e5e6 100644 --- a/lite/model_parser/pb/block_desc.h +++ b/lite/model_parser/pb/block_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +50,11 @@ class BlockDesc : public BlockDescAPI { template T* GetVar(int32_t idx); + template + T const* GetVar(int32_t idx) const { + return GetVar(idx); + } + template T* AddVar(); @@ -60,6 +65,11 @@ class BlockDesc : public BlockDescAPI { template T* GetOp(int32_t idx); + template + T const* GetOp(int32_t idx) const { + return GetOp(idx); + } + template T* AddOp(); diff --git a/lite/model_parser/pb/op_desc.h b/lite/model_parser/pb/op_desc.h index f21c194a271b46c84b3a363c6f7c0d9c1f7b1f32..6f186e778298a5ae59a63188640725b3ae5322c9 100644 --- a/lite/model_parser/pb/op_desc.h +++ b/lite/model_parser/pb/op_desc.h @@ -26,7 +26,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/all.h" namespace paddle { diff --git a/lite/model_parser/pb/program_desc.h b/lite/model_parser/pb/program_desc.h index 38c667f78b98956d26231f90f66a9914eeb349dc..950bf5480db501289250ece88b28d1c1369e56fc 100644 --- a/lite/model_parser/pb/program_desc.h +++ b/lite/model_parser/pb/program_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -42,6 +42,11 @@ class ProgramDesc : public ProgramDescAPI { template T *GetBlock(int32_t idx); + template + T const *GetBlock(int32_t idx) 
const { + return GetBlock(idx); + } + template T *AddBlock(); diff --git a/lite/model_parser/pb/var_desc.cc b/lite/model_parser/pb/var_desc.cc index f849b8dd0ed103f789aec41e5c88f3e4f3cdf878..42625ee6190fb98c50de2b88a08b9910d91ed014 100644 --- a/lite/model_parser/pb/var_desc.cc +++ b/lite/model_parser/pb/var_desc.cc @@ -294,9 +294,9 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_->type().tensor_array().tensor(); default: - LOG(FATAL) - << "Getting 'tensor_desc' is not supported by the type of var %s." - << this->Name(); + LOG(WARNING) << "Getting 'tensor_desc' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return framework::proto::VarDesc().type().lod_tensor().tensor(); } @@ -312,10 +312,9 @@ std::vector VarDesc::tensor_descs() const { } return res; default: - LOG(FATAL) - << "Getting 'tensor_descs' is not supported by the type of var " - "%s." - << this->Name(); + LOG(WARNING) << "Getting 'tensor_descs' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return std::vector(); } diff --git a/lite/model_parser/pb/var_desc.h b/lite/model_parser/pb/var_desc.h index eefacef4b0c90faf132b2e4ef141ac7009939db5..d36881d5892ca8b4bef754554d164409fab4b858 100644 --- a/lite/model_parser/pb/var_desc.h +++ b/lite/model_parser/pb/var_desc.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index cd73259a338992e3a88c753c5935b547bbe7595d..17abee4a217897cef3ec7d1e03267e5d00dbef91 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -58,6 +58,7 @@ add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(group_norm_op extra SRCS group_norm_op.cc DEPS ${op_DEPS}) add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) # 3.extra ops @@ -76,6 +77,8 @@ add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_pad_op_lite extra SRCS sequence_pad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_mask_op_lite extra SRCS sequence_mask_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) @@ -86,6 +89,7 @@ add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_m add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op basic SRCS assign_value_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) +add_operator(fake_quantize_dequantize_abs_max_op extra SRCS fake_quantize_dequantize_abs_max.cc DEPS ${op_DEPS}) 
add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${op_DEPS}) @@ -110,6 +114,9 @@ add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposal add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) +add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS}) +add_operator(clip_op extra SRCS clip_op.cc DEPS ${op_DEPS}) +add_operator(print_op extra SRCS print_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) @@ -137,14 +144,17 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) +add_operator(retinanet_detection_output_op extra SRCS retinanet_detection_output_op.cc DEPS ${op_DEPS}) +add_operator(where_index_op extra SRCS where_index_op.cc DEPS ${op_DEPS}) # for content-dnn specific add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) +add_operator(topk_pooling_op extra SRCS topk_pooling_op.cc DEPS ${op_DEPS}) # for deformable-convNet -add_operator(deformable_conv_op basic SRCS deformable_conv_op.cc DEPS ${op_DEPS}) +add_operator(deformable_conv_op extra SRCS deformable_conv_op.cc DEPS ${op_DEPS}) # 4. training op add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) @@ -160,6 +170,9 @@ add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS} add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS}) add_operator(__xpu__embedding_with_eltwise_add_op extra SRCS __xpu__embedding_with_eltwise_add_op.cc DEPS ${op_DEPS}) add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc diff --git a/lite/operators/__xpu__mmdnn_op.cc b/lite/operators/__xpu__mmdnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b898c0b132dc0767c8ba28c29098ac998c2cab21 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__mmdnn_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnBidEmbGrnnAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.grnn_fw_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.grnn_fw_pool_out = + scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp2::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp2::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.emb0_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb0_out->set_lod({id_lod}); + param_.grnn_fw_pool_out->Resize( + 
{(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp2::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.emb0_out = scope->FindVar(op_desc.Output("emb0_out").front()) + ->GetMutable(); + param_.grnn_fw_pool_out = + scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + + param_.att_pool_out->Resize({(int64_t)id_lod.size() - 1, emb_tbl_dims[1]}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::CheckShape() const { return 
true; } + +bool XPUMmdnnMatchConvTopkOp::InferShapeImpl() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.input_x->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.topk_out->Resize(lite::DDim(vec_out_shape)); + param_.topk_out->set_lod(param_.input_x->lod()); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input_x = scope->FindVar(op_desc.Input("input_x").front()) + ->GetMutable(); + param_.input_y = scope->FindVar(op_desc.Input("input_y").front()) + ->GetMutable(); + param_.input_w = scope->FindVar(op_desc.Input("input_w").front()) + ->GetMutable(); + param_.conv_w = scope->FindVar(op_desc.Input("conv_w").front()) + ->GetMutable(); + + param_.topk_out = scope->FindVar(op_desc.Output("topk_out").front()) + ->GetMutable(); + + param_.input_w_max = op_desc.GetAttr("input_w_max"); + param_.conv_w_max = op_desc.GetAttr("conv_w_max"); + param_.topks = op_desc.GetAttr>("topks"); + param_.output_channel = op_desc.GetAttr("output_channel"); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.dim_t = op_desc.GetAttr("dim_t"); + return true; +} + +bool XPUMmdnnMergeAllOp::CheckShape() const { return true; } + +bool XPUMmdnnMergeAllOp::InferShapeImpl() const { + int64_t dim0 = param_.concat_7in1_x[0]->dims()[0]; + int64_t dim1 = param_.fc2_w->dims()[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(dim0); + vec_out_shape.push_back(dim1); + + param_.out->Resize(lite::DDim(vec_out_shape)); + return true; +} + +bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.concat_7in1_x.clear(); + for (auto& name : op_desc.Input("concat_7in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_7in1_x.push_back(t); + } + param_.concat_topk_x.clear(); + for (auto& name : op_desc.Input("concat_topk_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_topk_x.push_back(t); + } + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.fc0_w = scope->FindVar(op_desc.Input("fc0_w").front()) + ->GetMutable(); + param_.fc0_b = scope->FindVar(op_desc.Input("fc0_b").front()) + ->GetMutable(); + param_.fc1_w = scope->FindVar(op_desc.Input("fc1_w").front()) + ->GetMutable(); + param_.fc1_b = scope->FindVar(op_desc.Input("fc1_b").front()) + ->GetMutable(); + param_.fc2_w = scope->FindVar(op_desc.Input("fc2_w").front()) + ->GetMutable(); + param_.fc2_b = scope->FindVar(op_desc.Input("fc2_b").front()) + ->GetMutable(); + + param_.out = + scope->FindVar(op_desc.Output("out").front())->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.fc0_w_max = op_desc.GetAttr("fc0_w_max"); + param_.fc1_w_max = op_desc.GetAttr("fc1_w_max"); + param_.fc2_w_max = op_desc.GetAttr("fc2_w_max"); + return true; +} + +} // 
namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att2, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp2); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att, + paddle::lite::operators::XPUMmdnnBidEmbAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk, + paddle::lite::operators::XPUMmdnnMatchConvTopkOp); +REGISTER_LITE_OP(__xpu__mmdnn_merge_all, + paddle::lite::operators::XPUMmdnnMergeAllOp); diff --git a/lite/operators/__xpu__mmdnn_op.h b/lite/operators/__xpu__mmdnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba815a1eec7d0913bc08b4f8fa520de73a4bb835 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.h @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnBidEmbGrnnAttOp : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp() {} + + explicit XPUMmdnnBidEmbGrnnAttOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbGrnnAttOp"; } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam param_; +}; + +class XPUMmdnnBidEmbGrnnAttOp2 : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp2() {} + + explicit XPUMmdnnBidEmbGrnnAttOp2(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnBidEmbGrnnAttOp2"; + } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam2 param_; +}; + +class XPUMmdnnBidEmbAttOp : public OpLite { + public: + XPUMmdnnBidEmbAttOp() {} + + explicit XPUMmdnnBidEmbAttOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbAttOp"; } + + private: + mutable XPUMmdnnBidEmbAttParam param_; +}; + +class XPUMmdnnMatchConvTopkOp : public OpLite { + public: + XPUMmdnnMatchConvTopkOp() {} + + explicit XPUMmdnnMatchConvTopkOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, 
lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMatchConvTopkOp"; } + + private: + mutable XPUMmdnnMatchConvTopkParam param_; +}; + +class XPUMmdnnMergeAllOp : public OpLite { + public: + XPUMmdnnMergeAllOp() {} + + explicit XPUMmdnnMergeAllOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMergeAllOp"; } + + private: + mutable XPUMmdnnMergeAllParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__resnet_cbam_op.cc b/lite/operators/__xpu__resnet_cbam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6013f4fa90033c51df7a0d3bb670e02f8bf4628d --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__resnet_cbam_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUResNetCbamOp::CheckShape() const { return true; } + +bool XPUResNetCbamOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + std::vector output_shape_vec{1, 64}; + paddle::lite::DDim output_shape(output_shape_vec); + output_shape[0] = input_shape[0]; + param_.output->Resize(output_shape); + return true; +} + +bool XPUResNetCbamOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.filter.clear(); + for (auto& name : op_desc.Input("Filter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.filter.push_back(t); + } + param_.bias.clear(); + for (auto& name : op_desc.Input("Bias")) { + if (name.substr(0, 11) == "placeholder") { + param_.bias.push_back(nullptr); + } else { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.bias.push_back(t); + } + } + param_.max_filter.clear(); + for (auto& name : op_desc.Input("MaxFilter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.max_filter.push_back(t); + } + + param_.pool_p = op_desc.GetAttr("pool_p"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__resnet_cbam, paddle::lite::operators::XPUResNetCbamOp); diff --git a/lite/operators/__xpu__resnet_cbam_op.h b/lite/operators/__xpu__resnet_cbam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..26e5bafeae31183e9054e7e77ea46813c95db707 --- 
/dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUResNetCbamOp : public OpLite { + public: + XPUResNetCbamOp() {} + explicit XPUResNetCbamOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "ResNetCbam"; } + + private: + mutable XPUResNetCbamParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__search_attention_op.cc b/lite/operators/__xpu__search_attention_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..acd8c817b0d81ef03df1c05417b8bb2f56c00812 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/__xpu__search_attention_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnSearchAttentionOp::CheckShape() const { return true; } + +bool XPUMmdnnSearchAttentionOp::InferShapeImpl() const { + auto& x_dims = param_.X->dims(); + param_.Out->Resize(x_dims); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool XPUMmdnnSearchAttentionOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.W = scope->FindVar(w)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.W_max = op_desc.GetAttr("W_max"); + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.alpha0 = op_desc.GetAttr("alpha0"); + param_.alpha1 = op_desc.GetAttr("alpha1"); + param_.mask = op_desc.GetAttr("mask"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_search_attention, + paddle::lite::operators::XPUMmdnnSearchAttentionOp); diff --git a/lite/operators/__xpu__search_attention_op.h b/lite/operators/__xpu__search_attention_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81bd366ee8a51dc8d2d7fb4c9cb03d2199bcb4f2 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnSearchAttentionOp : public OpLite { + public: + XPUMmdnnSearchAttentionOp() {} + + explicit XPUMmdnnSearchAttentionOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnSearchAttentionOp"; + } + + private: + mutable XPUMmdnnSearchAttentionParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/activation_grad_ops.cc b/lite/operators/activation_grad_ops.cc index b31163e5dce6d9b77d923ba44ed58952263610a5..a30231be921e2c4445bb4c7a72c9572b14c1c0f5 100644 --- a/lite/operators/activation_grad_ops.cc +++ b/lite/operators/activation_grad_ops.cc @@ -41,15 +41,11 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, if (opdesc.HasInput("X")) { auto X_name = opdesc.Input("X").front(); param_.X = GetVar(scope, X_name); - } else { - param_.X = param_.X_grad; } if (opdesc.HasInput("Out")) { auto Out_name = opdesc.Input("Out").front(); param_.Out = GetVar(scope, Out_name); - } else { - param_.Out = param_.Out_grad; } return true; @@ -60,3 +56,5 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, } // namespace paddle REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(relu_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(tanh_grad, paddle::lite::operators::ActivationGradOp); diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index a3d9895955d99b96609a8c35e2493b17a11b9181..01e4116c94c75df3bd5360494c57419fe57c18ef 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -82,7 +82,11 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { param_.hard_swish_offset = opdesc.GetAttr("offset"); } else if (opdesc.Type() == "reciprocal") { param_.active_type = lite_api::ActivationType::kReciprocal; + } else if (opdesc.Type() == "thresholded_relu") { + param_.active_type = lite_api::ActivationType::kThresholdedRelu; + param_.relu_threshold = opdesc.GetAttr("threshold"); } + VLOG(4) << "opdesc.Type():" << opdesc.Type(); param_.Out = scope->FindVar(out_name)->GetMutable(); @@ -100,3 +104,4 @@ REGISTER_LITE_OP(relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(leaky_relu, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(prelu, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(thresholded_relu, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/activation_ops.h b/lite/operators/activation_ops.h index 71fda90bcd893bb0589697a7726b0b9a7500fb6d..250a88de42b4004932f78b0490a844d4a8dbc6fe 100644 --- a/lite/operators/activation_ops.h +++ b/lite/operators/activation_ops.h @@ -80,6 +80,9 @@ class ActivationOp : public OpLite { break; case lite_api::ActivationType::kIndentity: break; + case lite_api::ActivationType::kThresholdedRelu: + ch->macs = param_.X->numel(); + break; default: LOG(FATAL) << "This Type of Activation:" << static_cast(param_.active_type) diff --git a/lite/operators/assign_op.cc b/lite/operators/assign_op.cc index 
fe1e8db1f954af38041621d1d676cf16833357da..f2237230dceda55c89a423e0ee9504ee1e3c1de8 100644 --- a/lite/operators/assign_op.cc +++ b/lite/operators/assign_op.cc @@ -21,15 +21,15 @@ namespace lite { namespace operators { bool AssignOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.X); - CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.X || param_.X_array); + CHECK_OR_FALSE(param_.Out || param_.Out_array); return true; } bool AssignOpLite::InferShapeImpl() const { - if (param_.X != nullptr) { + if (param_.X) { param_.Out->Resize(param_.X->dims()); - } else if (param_.X_array != nullptr) { + } else if (param_.X_array) { param_.Out_array->resize(param_.Out_array->size()); } else { LOG(FATAL) << "x or x_array must be set."; diff --git a/lite/operators/assign_value_op.cc b/lite/operators/assign_value_op.cc index ff5b55735f7b58aa2eaa2274574336dadd8061e6..f6f8cb7e3c8958693dd7234b7a21b29b769aa96c 100644 --- a/lite/operators/assign_value_op.cc +++ b/lite/operators/assign_value_op.cc @@ -26,12 +26,15 @@ bool AssignValueOpLite::CheckShape() const { auto shape = param_.shape; auto int32_values = param_.int32_values; auto fp32_values = param_.fp32_values; + auto int64_values = param_.int64_values; + auto bool_values = param_.bool_values; size_t shape_num = 1; - for (int i = 0; i < shape.size(); i++) { + for (size_t i = 0; i < shape.size(); i++) { shape_num *= shape[i]; } - CHECK_OR_FALSE(shape_num == int32_values.size() || - shape_num == fp32_values.size()); + CHECK_OR_FALSE( + shape_num == int32_values.size() || shape_num == fp32_values.size() || + shape_num == int64_values.size() || shape_num == bool_values.size()); return true; } @@ -47,9 +50,18 @@ bool AssignValueOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { param_.shape = op_desc.GetAttr>("shape"); param_.dtype = op_desc.GetAttr("dtype"); - param_.fp32_values = op_desc.GetAttr>("fp32_values"); - param_.int32_values = op_desc.GetAttr>("int32_values"); - + if (op_desc.HasAttr("fp32_values")) { + param_.fp32_values = op_desc.GetAttr>("fp32_values"); + } + if (op_desc.HasAttr("int32_values")) { + param_.int32_values = op_desc.GetAttr>("int32_values"); + } + if (op_desc.HasAttr("int64_values")) { + param_.int64_values = op_desc.GetAttr>("int64_values"); + } + if (op_desc.HasAttr("bool_values")) { + param_.bool_values = op_desc.GetAttr>("bool_values"); + } auto out = op_desc.Output("Out").front(); param_.Out = scope->FindVar(out)->GetMutable(); return true; diff --git a/lite/operators/clip_op.cc b/lite/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8eef45f3b38cd176d1bd3d2d0b42620faf602c --- /dev/null +++ b/lite/operators/clip_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
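The activation changes earlier in this patch (activation_ops.cc / activation_ops.h) register thresholded_relu and store its threshold attribute in param_.relu_threshold. As a reference for the expected element-wise behaviour, a minimal sketch using the standard thresholded-ReLU definition (the actual kernels are not part of this hunk):

#include <vector>

// Thresholded ReLU: keep the input where it exceeds the threshold, zero elsewhere.
// One operation per element, consistent with the macs = X->numel() estimate
// added to ActivationOp::GetOpRuntimeInfo.
std::vector<float> ThresholdedRelu(const std::vector<float>& x, float threshold) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = (x[i] > threshold) ? x[i] : 0.0f;
  }
  return out;
}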
+ +#include "lite/operators/clip_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool ClipOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool ClipOpLite::InferShapeImpl() const { + param_.out->Resize(param_.x->dims()); + param_.out->set_lod(param_.x->lod()); + return true; +} + +bool ClipOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachInput(op_desc, scope, "X", false, ¶m_.x); + AttachInput(op_desc, scope, "Min", true, ¶m_.min_tensor); + AttachInput(op_desc, scope, "Max", true, ¶m_.max_tensor); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.min = op_desc.GetAttr("min"); + param_.max = op_desc.GetAttr("max"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(clip, paddle::lite::operators::ClipOpLite); diff --git a/lite/operators/clip_op.h b/lite/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..25c7f9a824ffc4b395a13df39811074724211f44 --- /dev/null +++ b/lite/operators/clip_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ClipOpLite : public OpLite { + public: + ClipOpLite() {} + + explicit ClipOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "clip"; } + + private: + mutable ClipParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc index e3678e92c9d33be5428c82331ce963f4c6067369..de8bea345fe8da1e157665b93f9d50c6f6bbffa3 100644 --- a/lite/operators/conditional_block_op.cc +++ b/lite/operators/conditional_block_op.cc @@ -20,35 +20,37 @@ namespace paddle { namespace lite { namespace operators { -bool ConditionalBlockOpLite::CheckShape() const { +bool ConditionalBlockOp::CheckShape() const { CHECK_OR_FALSE(param_.cond); - CHECK_OR_FALSE(param_.sub_block); - CHECK_OR_FALSE(param_.scope); + CHECK_OR_FALSE(param_.program_desc); + CHECK_OR_FALSE(param_.exec_scope); return true; } -bool ConditionalBlockOpLite::InferShapeImpl() const { return true; } +bool ConditionalBlockOp::InferShapeImpl() const { return true; } -bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConditionalBlockOp::AttachImpl(const cpp::OpDesc& op_desc, Scope* scope) { auto condition = op_desc.Input("Cond").front(); param_.cond = scope->FindVar(condition)->GetMutable(); - auto inputs = op_desc.Input("Input"); - for (auto var : inputs) { - param_.x.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& input : inputs) { + auto* var = scope->FindVar(input); + CHECK(var); + param_.inputs.push_back(var->GetMutable()); } - auto outs = op_desc.Output("Out"); - for (auto var : outs) { - param_.outs.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& out : outs) { + auto* var = scope->FindVar(out); + CHECK(var); + param_.outs.push_back(var->GetMutable()); } - param_.is_scalar_condition = op_desc.GetAttr("is_scalar_condition"); // obtain sub_block in core program.cc - param_.sub_block = sub_block_; - param_.scope = scope; - + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } @@ -57,4 +59,4 @@ bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, } // namespace paddle REGISTER_LITE_OP(conditional_block, - paddle::lite::operators::ConditionalBlockOpLite); + paddle::lite::operators::ConditionalBlockOp); diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h index 1815731c8df3ac07bee80aa8e0cc658e752b5c4f..adcd8acdff391e2ae3ece9ec21669d853250dcf4 100644 --- a/lite/operators/conditional_block_op.h +++ b/lite/operators/conditional_block_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,27 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class ConditionalBlockOpLite : public OpLite { +class ConditionalBlockOp : public OpLite { public: - ConditionalBlockOpLite() {} - explicit ConditionalBlockOpLite(const std::string &op_type) - : OpLite(op_type) {} + ConditionalBlockOp() {} + explicit ConditionalBlockOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conditional_block"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable ConditionalBlockParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index c3e375e2e44b8184e6e7e635ab2c6c1f8889f844..a1d4e2e8a038046b257b3ab5f936cc4cb2e62c67 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -74,7 +74,7 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); param_.dilations = std::make_shared>(dilations); @@ -130,15 +130,18 @@ class ConvOpLite : public OpLite { padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = - op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) { - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto filter_name = op_info->Input("Filter").front(); + auto output_name = op_info->Output("Output").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(filter_name)) + param_.weight_scale = op_info->GetInputScale(filter_name); + if (op_info->HasOutputScale(output_name)) { + param_.output_scale = op_info->GetOutputScale(output_name)[0]; } } diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index 9d098eb975ef071a4650ea547d6081d950b251f1..732f8c5056f930259655339c8d8a0b2846f29313 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -106,7 +106,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); 
auto dilations = op_desc.GetAttr>("dilations"); diff --git a/lite/operators/deformable_conv_op.cc b/lite/operators/deformable_conv_op.cc index 8cc8614d00801fb033bc3f449e82f9f03e271db5..a834528f27c9d6c97e355a1a149482ad00ae79aa 100644 --- a/lite/operators/deformable_conv_op.cc +++ b/lite/operators/deformable_conv_op.cc @@ -84,5 +84,5 @@ bool DeformableConvOpLite::InferShapeImpl() const { } // namespace lite } // namespace paddle -REGISTER_LITE_OP(DeformableConv2d, +REGISTER_LITE_OP(deformable_conv, paddle::lite::operators::DeformableConvOpLite); diff --git a/lite/operators/deformable_conv_op.h b/lite/operators/deformable_conv_op.h index aa736fcef6b6f74740253b8607e8bfcd938d0ff8..69b764758699089bdee0a64e33a01d838b011ec0 100644 --- a/lite/operators/deformable_conv_op.h +++ b/lite/operators/deformable_conv_op.h @@ -83,7 +83,7 @@ class DeformableConvOpLite : public OpLite { param_.conv_param.filter = scope->FindVar(Filter)->GetMutable(); param_.conv_param.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); auto dilations = op_desc.GetAttr>("dilations"); param_.conv_param.groups = op_desc.GetAttr("groups"); param_.conv_param.dilations = std::make_shared>(dilations); diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 6cc41f0a66cfac4a0baa0153765a59766fa045f4..5895bb667aa22507d362004627304ecf78e085f1 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -144,6 +144,8 @@ REGISTER_LITE_OP(elementwise_add, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_mod, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_pow, paddle::lite::operators::ElementwiseOp); // #ifdef LITE_WITH_TRAIN // REGISTER_LITE_OP(elementwise_sub_grad, diff --git a/lite/operators/fake_quantize_dequantize_abs_max.cc b/lite/operators/fake_quantize_dequantize_abs_max.cc new file mode 100644 index 0000000000000000000000000000000000000000..354f5e9dcdbd55f634ae394187c5f9163eb9c25a --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
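The fake_quantize_dequantize_abs_max op whose definition begins just below only declares X, Out, OutScale and the bit_length attribute; the numeric behaviour lives in the kernels. For orientation, a sketch of the usual abs-max quantize/dequantize round trip, assuming the standard formulation (scale = max|x|, bin_cnt = 2^(bit_length-1) - 1); the exact rounding mode is an assumption:

#include <algorithm>
#include <cmath>
#include <vector>

// Quantize to bit_length bits with an abs-max scale, then dequantize again.
// The result approximates the input and out_scale records max|x|.
std::vector<float> FakeQuantDequantAbsMax(const std::vector<float>& x,
                                          int bit_length, float* out_scale) {
  float scale = 0.0f;
  for (float v : x) scale = std::max(scale, std::fabs(v));
  *out_scale = scale;
  const float bin_cnt = static_cast<float>((1 << (bit_length - 1)) - 1);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = (scale == 0.0f)
                 ? 0.0f
                 : std::round(x[i] / scale * bin_cnt) * scale / bin_cnt;
  }
  return out;
}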
+ +#include "lite/operators/fake_quantize_dequantize_abs_max.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators {} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(fake_quantize_dequantize_abs_max, + paddle::lite::operators::FakeQuantizeDequantizeAbsMaxOpLite); diff --git a/lite/operators/fake_quantize_dequantize_abs_max.h b/lite/operators/fake_quantize_dequantize_abs_max.h new file mode 100644 index 0000000000000000000000000000000000000000..7413b448ea5e2317501960a246478d15242f9cdc --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FakeQuantizeDequantizeAbsMaxOpLite : public OpLite { + public: + FakeQuantizeDequantizeAbsMaxOpLite() {} + + explicit FakeQuantizeDequantizeAbsMaxOpLite(const std::string &type) + : OpLite(type) {} + + bool CheckShape() const override { return true; } + + bool InferShapeImpl() const override { return true; } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + auto out_scale = op_desc.Output("OutScale").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_scale = scope->FindVar(out_scale)->GetMutable(); + param_.bit_length = op_desc.GetAttr("bit_length"); + return true; + } + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fake_quantize_dequantize_abs_max"; + } + + private: + mutable FakeQuantDequantAbsMaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index d4032c5e8b98ff6d5763d2d06610d2e214ad90ca..28a220da2de0920643d46f1ed9c610dfa613cf95 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -102,14 +102,18 @@ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto 
input_name = op_info->Input("Input").front(); + auto weight_name = op_info->Input("W").front(); + auto out_name = op_info->Output("Out").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(weight_name)) + param_.weight_scale = op_info->GetInputScale(weight_name); + if (op_info->HasOutputScale(out_name)) + param_.output_scale = op_info->GetOutputScale(out_name)[0]; } return true; } diff --git a/lite/operators/group_norm_op.cc b/lite/operators/group_norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1a6413ebb140bac4a1d7e74ef42413f489395c7 --- /dev/null +++ b/lite/operators/group_norm_op.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/group_norm_op.h" +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool GroupNormOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.scale); + CHECK_OR_FALSE(param_.bias); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.saved_mean); + CHECK_OR_FALSE(param_.saved_variance); + auto x_dims = param_.x->dims(); + auto scale_dims = param_.scale->dims(); + auto bias_dims = param_.bias->dims(); + CHECK(x_dims.size() >= 2 && x_dims.size() <= 5) + << "Input X must have 2 to 5 dimensions."; + CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimensions."; + CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimensions."; + CHECK_GT(param_.epsilon, 0.f) << "epsilon should be greater than 0.f"; + CHECK_LT(param_.epsilon, 0.01f) << "epsilon should be less than 0.01f"; + CHECK_EQ(param_.channels, x_dims[1]) + << "Input channels must be equal input_shape[1]"; + CHECK_EQ(param_.channels % param_.groups, 0) + << "channels must be divide groups"; + return true; +} + +bool GroupNormOp::InferShapeImpl() const { + auto x_dims = param_.x->dims(); + int64_t batch_size = x_dims[0]; + int64_t num = param_.channels / param_.groups; + param_.saved_mean->Resize({batch_size * num}); + param_.saved_variance->Resize({batch_size * num}); + param_.out->Resize(x_dims); + return true; +} + +bool GroupNormOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { + param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable(); + param_.scale = + scope->FindVar(op_desc.Input("Scale").front())->GetMutable(); + param_.bias = + scope->FindVar(op_desc.Input("Bias").front())->GetMutable(); + param_.saved_mean = + scope->FindVar(op_desc.Output("SavedMean").front())->GetMutable(); + param_.saved_variance = + scope->FindVar(op_desc.Output("SavedVariance").front()) + ->GetMutable(); + param_.out = + scope->FindVar(op_desc.Output("Y").front())->GetMutable(); + param_.epsilon = op_desc.GetAttr("epsilon"); + param_.groups = op_desc.GetAttr("groups"); + param_.channels = 
op_desc.GetAttr("channels"); + return true; +} + +} /* namespace operators */ +} /* namespace lite */ +} /* namespace paddle */ + +REGISTER_LITE_OP(group_norm, paddle::lite::operators::GroupNormOp); diff --git a/lite/operators/group_norm_op.h b/lite/operators/group_norm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f2251686ea2caa89e3934e8adae69466f9c9515d --- /dev/null +++ b/lite/operators/group_norm_op.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class GroupNormOp : public OpLite { + public: + GroupNormOp() {} + + explicit GroupNormOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "group_norm"; } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + ch->input_shape = ch->DimToStr(param_.x->dims()); + ch->output_shape = ch->DimToStr(param_.out->dims()); + // ch->remark = ""; + auto x_dims = param_.x->dims(); + auto nc = x_dims[0] * x_dims[1]; + auto hw = x_dims[2] * x_dims[3]; + auto nchw = x_dims.production(); + ch->macs = 5.f * nchw + 3.f * (nc + hw); + } +#endif + + private: + mutable GroupNormParam param_; +}; + +} /* namespace operators */ +} /* namespace lite */ +} /* namespace paddle */ diff --git a/lite/operators/gru_op.cc b/lite/operators/gru_op.cc index 862a1ff98f699393c9aa91afab978f947cc25187..0a9128dcd27870f6456b26ba636d4189267583be 100644 --- a/lite/operators/gru_op.cc +++ b/lite/operators/gru_op.cc @@ -75,9 +75,8 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { auto batch_reset_hidden_prev = op_desc.Output("BatchResetHiddenPrev").front(); auto batch_hidden = op_desc.Output("BatchHidden").front(); auto hidden = op_desc.Output("Hidden").front(); - param_.input = scope->FindVar(input)->GetMutable(); - if (op_desc.Input("H0").size()) { + if (!op_desc.Input("H0").empty()) { auto h0 = op_desc.Input("H0").front(); param_.h0 = scope->FindVar(h0)->GetMutable(); } @@ -90,7 +89,7 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { scope->FindVar(batch_hidden)->GetMutable(); param_.hidden = scope->FindVar(hidden)->GetMutable(); - if (op_desc.HasInput("Bias")) { + if (!op_desc.Input("Bias").empty()) { auto bias = op_desc.Input("Bias").front(); param_.bias = scope->FindVar(bias)->GetMutable(); } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc index 1cc751109f76a96097d363b493322dde182a715d..fd70143131b458c1d985a21a6d9d84c707ba9986 100644 --- 
a/lite/operators/match_matrix_tensor_op.cc +++ b/lite/operators/match_matrix_tensor_op.cc @@ -94,6 +94,18 @@ bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.dim_t = op_desc.GetAttr("dim_t"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/max_pool_with_index_op.h b/lite/operators/max_pool_with_index_op.h index bd82743c279c4728483c72f017a8fa6e94cf3eb4..dfc220907549dc9ce61726b79cb1626c2734b234 100644 --- a/lite/operators/max_pool_with_index_op.h +++ b/lite/operators/max_pool_with_index_op.h @@ -54,7 +54,7 @@ class MaxPoolWithIndexOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("adaptive")) { param_.adaptive = op_desc.GetAttr("adaptive"); } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 955f5a19de6c191f5eba53774f5137c90d481dd8..ef728924c1afe2cb4040ca84c27dc9ea09f18190 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -21,10 +21,9 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/utils/all.h" -#include "lite/utils/variant.h" /* * This file contains all the argument parameter data structure for operators. 
*/ @@ -91,9 +90,9 @@ struct SubgraphParam : ParamBase { std::vector output_names{}; std::vector input_data_names{}; std::vector output_data_names{}; - int sub_block_idx{-1}; - cpp::BlockDesc* sub_block_desc{nullptr}; - Scope* scope{nullptr}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; /// -------------------------- NN operators ------------------------------------ @@ -358,6 +357,8 @@ struct ActivationParam : ParamBase { float hard_swish_threshold{6.0}; float hard_swish_scale{6.0}; float hard_swish_offset{3.0}; + // thresholded_relu + float relu_threshold{1.0f}; }; struct ActivationGradParam : ParamBase { @@ -677,6 +678,13 @@ struct FakeChannelWiseDequantizeMaxAbsParam : ParamBase { std::vector quant_bits; }; +struct FakeQuantDequantAbsMaxParam : ParamBase { + const lite::Tensor* x{}; + lite::Tensor* out{}; + lite::Tensor* out_scale{}; + int bit_length; +}; + /// ----------------------- sgd operators ---------------------- struct SGDParam : ParamBase { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; @@ -938,11 +946,10 @@ struct CompareParam : ParamBase { }; struct WhileParam : ParamBase { - Scope* scope{}; Tensor* cond{}; - cpp::BlockDesc* sub_block{}; - std::vector x{}; - std::vector outs{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; struct TopkParam : ParamBase { @@ -1030,12 +1037,28 @@ struct SequenceExpandParam : ParamBase { int ref_level{-1}; }; +struct SequencePadParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* PadValue{}; + lite::Tensor* Out{}; + lite::Tensor* Length{}; + int padded_length{-1}; +}; + struct SequenceUnpadParam : ParamBase { const lite::Tensor* X{}; const lite::Tensor* Length{}; lite::Tensor* Out{}; }; +struct SequenceMaskParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* MaxLenTensor{nullptr}; + lite::Tensor* Y{}; + int maxlen{-1}; + int out_dtype; +}; + struct SequenceExpandAsParam : ParamBase { const lite::Tensor* x{nullptr}; const lite::Tensor* y{nullptr}; @@ -1112,6 +1135,11 @@ struct VarConv2DParam : ParamBase { int kernel_w; bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// ----------------------- shape operators ---------------------- @@ -1329,6 +1357,8 @@ struct AssignValueParam : ParamBase { int dtype{}; std::vector fp32_values{}; std::vector int32_values{}; + std::vector int64_values{}; + std::vector bool_values{}; lite::Tensor* Out{}; }; @@ -1343,6 +1373,15 @@ struct SequenceTopkAvgPoolingParam : ParamBase { std::vector topks{}; }; +/// --------------- topk_pooling operators ------------------ +struct TopkPoolingParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + lite::Tensor* Out{}; + int top_k{1}; + int feat_map_num{1}; +}; + /// --------------- search_fc operators ------------------ struct SearchFcParam : ParamBase { const lite::Tensor* X{}; @@ -1350,6 +1389,13 @@ struct SearchFcParam : ParamBase { const lite::Tensor* b{}; lite::Tensor* Out{}; int out_size{}; + + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// --------------------- match_matrix_tensor operators -------------------- struct MatchMatrixTensorParam : ParamBase { @@ -1360,6 +1406,12 @@ struct MatchMatrixTensorParam : ParamBase { lite::Tensor* tmp{}; int dim_t; + 
bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is w already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in w +#endif }; /// --------------------- search_seq_depadding operators -------------------- @@ -1381,6 +1433,12 @@ struct SearchGrnnParam : ParamBase { lite::Tensor* tmp_buffer{}; lite::Tensor* idx_sorted_by_width{}; lite::Tensor* layout_input{}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is wi/wh already converted to int16/int8 + std::vector __xpu__wi_max; // Abs max in wi + std::vector __xpu__wh_max; // Abs max in wh +#endif }; struct SplitLodTensorParam : ParamBase { @@ -1402,10 +1460,11 @@ struct MergeLodTensorParam : ParamBase { struct ConditionalBlockParam : ParamBase { const lite::Tensor* cond{}; - std::vector x{}; + std::vector inputs{}; std::vector outs{}; - cpp::BlockDesc* sub_block{}; - Scope* scope{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; bool is_scalar_condition{}; }; @@ -1436,6 +1495,19 @@ struct InstanceNormParam : ParamBase { lite::Tensor* saved_variance{}; float epsilon; }; +/// --------------------- group_norm operators -------------------- +struct GroupNormParam : ParamBase { + lite::Tensor* x{}; + lite::Tensor* out{}; + lite::Tensor* bias{}; + lite::Tensor* scale{}; + lite::Tensor* saved_mean{}; + lite::Tensor* saved_variance{}; + float epsilon; + int groups; + int channels; +}; + /// --------------------- grid sampler operators -------------------- struct GridSamplerParam : ParamBase { lite::Tensor* x{}; @@ -1522,6 +1594,132 @@ struct XPUFcParam : ParamBase { std::string activation_type{""}; }; +struct XPUResNetCbamParam : ParamBase { + lite::Tensor* input{}; + std::vector filter; + std::vector bias; + std::vector max_filter; + lite::Tensor* output{}; + + float pool_p{1.0f}; +}; + +struct XPUMmdnnSearchAttentionParam : ParamBase { + lite::Tensor* X{}; + lite::Tensor* W{}; + lite::Tensor* b{}; + lite::Tensor* Out{}; + + float W_max{0.0f}; + int pad_id{0}; + float alpha0{1.0f}; + float alpha1{1.0f}; + float mask{1.0f}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam2 : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* emb0_out{}; + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnBidEmbAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* att_fc_w{}; + 
lite::Tensor* att_fc_b{}; + + float att_fc_w_max{0.0f}; + + lite::Tensor* att_pool_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnMatchConvTopkParam : ParamBase { + lite::Tensor* input_x{}; + lite::Tensor* input_y{}; + lite::Tensor* input_w{}; + lite::Tensor* conv_w{}; + + float input_w_max{0.0f}; + float conv_w_max{0.0f}; + std::vector topks; + int output_channel{0}; + int channel_num{0}; + int dim_t{0}; + + lite::Tensor* topk_out{}; +}; + +struct XPUMmdnnMergeAllParam : ParamBase { + std::vector concat_7in1_x; + std::vector concat_topk_x; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* fc0_w{}; + lite::Tensor* fc0_b{}; + lite::Tensor* fc1_w{}; + lite::Tensor* fc1_b{}; + lite::Tensor* fc2_w{}; + lite::Tensor* fc2_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float fc0_w_max{0.0f}; + float fc1_w_max{0.0f}; + float fc2_w_max{0.0f}; + + lite::Tensor* out{}; +}; + // For DeformableConvolution op struct DeformableConvParam : ParamBase { lite::Tensor* x{}; @@ -1560,6 +1758,50 @@ struct PixelShuffleParam : ParamBase { lite::Tensor* output{nullptr}; int upscale_factor{1}; }; + +struct RetinanetDetectionOutputParam : ParamBase { + std::vector bboxes{}; + std::vector scores{}; + std::vector anchors{}; + Tensor* im_info{}; + Tensor* out{}; + float score_threshold{}; + int nms_top_k{}; + float nms_threshold{}; + float nms_eta{}; + int keep_top_k{}; +}; + +struct WhereIndexParam : ParamBase { + const lite::Tensor* input{nullptr}; + lite::Tensor* output{nullptr}; +}; + +struct ClipParam : ParamBase { + Tensor* x{}; + Tensor* min_tensor{}; + Tensor* max_tensor{}; + Tensor* out{}; + float min{}; + float max{}; +}; + +struct PrintParam : ParamBase { + const lite::Tensor* in{}; + lite::Tensor* out{}; + std::string name; + int first_n{-1}; + std::string message; + int summarize{20}; + bool print_tensor_name{true}; + bool print_tensor_type{true}; + bool print_tensor_shape{true}; + bool print_tensor_lod{true}; + bool print_tensor_layout{true}; + std::string print_phase; + bool is_forward{true}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pixel_shuffle_op.cc b/lite/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..40f564bdd6d2699bafe497bdfded21ea4f3956a3 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/pixel_shuffle_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool PixelShuffleOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.upscale_factor); + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + CHECK_EQ_OR_FALSE(x_dims[1] % (upscale_factor * upscale_factor), 0); + return true; +} + +bool PixelShuffleOpLite::InferShapeImpl() const { + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + auto output_dims = x_dims; + output_dims[0] = x_dims[0]; + output_dims[1] = x_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = x_dims[2] * upscale_factor; + output_dims[3] = x_dims[3] * upscale_factor; + param_.output->Resize(output_dims); + return true; +} + +bool PixelShuffleOpLite::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto input = opdesc.Input("X").front(); + auto out = opdesc.Output("Out").front(); + + param_.x = scope->FindVar(input)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + + if (opdesc.HasAttr("upscale_factor")) { + param_.upscale_factor = opdesc.GetAttr("upscale_factor"); + } + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(pixel_shuffle, paddle::lite::operators::PixelShuffleOpLite); diff --git a/lite/operators/pixel_shuffle_op.h b/lite/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..63efd8df778c6d92bc448f795c19ff5bffba62c8 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PixelShuffleOpLite : public OpLite { + public: + PixelShuffleOpLite() {} + explicit PixelShuffleOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "pixel_shuffle"; } + + private: + mutable PixelShuffleParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index 92f00a4272fddeb03abd04cba473a997cce37217..916ed1dd6f036c6c36954622abbbc1361de1b790 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -54,7 +54,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); diff --git a/lite/operators/print_op.cc b/lite/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f4299aed06f17d7bf3bd30b9fec34c587168884 --- /dev/null +++ b/lite/operators/print_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/print_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool PrintOp::CheckShape() const { + CHECK_OR_FALSE(param_.in); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool PrintOp::InferShapeImpl() const { + param_.out->set_lod(param_.in->lod()); + param_.out->Resize(param_.in->dims()); + return true; +} + +bool PrintOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachParam(¶m_); + + param_.name = op_desc.Input("In").front(); + param_.in = scope->FindTensor(param_.name); + param_.out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.first_n = op_desc.GetAttr("first_n"); + param_.message = op_desc.GetAttr("message"); + param_.summarize = op_desc.GetAttr("summarize"); + param_.print_tensor_name = op_desc.GetAttr("print_tensor_name"); + param_.print_tensor_type = op_desc.GetAttr("print_tensor_type"); + param_.print_tensor_shape = op_desc.GetAttr("print_tensor_shape"); + param_.print_tensor_lod = op_desc.GetAttr("print_tensor_lod"); + param_.print_tensor_layout = op_desc.GetAttr("print_tensor_layout"); + param_.print_phase = op_desc.GetAttr("print_phase"); + param_.is_forward = op_desc.GetAttr("is_forward"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(print, paddle::lite::operators::PrintOp); diff --git a/lite/operators/print_op.h b/lite/operators/print_op.h new file mode 100644 index 0000000000000000000000000000000000000000..cd8e777b59c3aac92771442402cf16623b75fbef --- /dev/null +++ b/lite/operators/print_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PrintOp : public OpLite { + public: + PrintOp() {} + explicit PrintOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "print"; } + + private: + mutable PrintParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/retinanet_detection_output_op.cc b/lite/operators/retinanet_detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27f2bfca0ab25b8f73d4c6a68d539a7c22389e0 --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/retinanet_detection_output_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool RetinanetDetectionOutputOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.bboxes.size() > 0); + CHECK_OR_FALSE(param_.scores.size() > 0); + CHECK_OR_FALSE(param_.anchors.size() > 0); + CHECK_OR_FALSE(param_.bboxes.size() == param_.scores.size()); + CHECK_OR_FALSE(param_.bboxes.size() == param_.anchors.size()); + CHECK_OR_FALSE(param_.im_info); + CHECK_OR_FALSE(param_.out); + + DDim bbox_dims = param_.bboxes.front()->dims(); + DDim score_dims = param_.scores.front()->dims(); + DDim anchor_dims = param_.anchors.front()->dims(); + DDim im_info_dims = param_.im_info->dims(); + + CHECK_OR_FALSE(bbox_dims.size() == 3); + CHECK_OR_FALSE(score_dims.size() == 3); + CHECK_OR_FALSE(anchor_dims.size() == 2); + CHECK_OR_FALSE(bbox_dims[2] == 4); + CHECK_OR_FALSE(bbox_dims[1] == score_dims[1]); + CHECK_OR_FALSE(anchor_dims[0] == bbox_dims[1]); + CHECK_OR_FALSE(im_info_dims.size() == 2); + + return true; +} + +bool RetinanetDetectionOutputOpLite::InferShapeImpl() const { + DDim bbox_dims = param_.bboxes.front()->dims(); + param_.out->Resize({bbox_dims[1], bbox_dims[2] + 2}); + return true; +} + +bool RetinanetDetectionOutputOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + for (auto arg_name : op_desc.Input("BBoxes")) { + param_.bboxes.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Scores")) { + param_.scores.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Anchors")) { + param_.anchors.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + AttachInput(op_desc, scope, "ImInfo", false, ¶m_.im_info); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.score_threshold = op_desc.GetAttr("score_threshold"); + param_.nms_top_k = op_desc.GetAttr("nms_top_k"); + param_.nms_threshold = op_desc.GetAttr("nms_threshold"); + param_.nms_eta = op_desc.GetAttr("nms_eta"); + param_.keep_top_k = op_desc.GetAttr("keep_top_k"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(retinanet_detection_output, + paddle::lite::operators::RetinanetDetectionOutputOpLite); diff --git a/lite/operators/retinanet_detection_output_op.h b/lite/operators/retinanet_detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9969227e15941644249b46ba7372f9afc705672c --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class RetinanetDetectionOutputOpLite : public OpLite { + public: + RetinanetDetectionOutputOpLite() {} + + explicit RetinanetDetectionOutputOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "retinanet_detection_output"; + } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif + + private: + mutable RetinanetDetectionOutputParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc index 71e62c2ae729b4e1516a219888b9af3f7d994428..8024c38f9cc4a6d3ba2d47d6c61e716dd57bb362 100644 --- a/lite/operators/search_fc_op.cc +++ b/lite/operators/search_fc_op.cc @@ -70,6 +70,18 @@ bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.Out = scope->FindVar(Out)->GetMutable(); param_.out_size = op_desc.GetAttr("out_size"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc index 1ced477c109d8cd93485f0193523887759939f17..6f743693bc782e636064ca398539433b497dc645 100644 --- a/lite/operators/search_grnn_op.cc +++ b/lite/operators/search_grnn_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/search_grnn_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -84,6 +85,18 @@ bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.layout_input = scope->FindVar(layout_input)->GetMutable(); +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__wi_max")) { + param_.__xpu__wi_max = op_desc.GetAttr>("__xpu__wi_max"); + } + if (op_desc.HasAttr("__xpu__wh_max")) { + param_.__xpu__wh_max = op_desc.GetAttr>("__xpu__wh_max"); + } +#endif + return true; } diff --git a/lite/operators/sequence_mask_op.cc b/lite/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bac1dc8a26abe9a9ae2bbd77e03c2375b4814268 --- /dev/null +++ b/lite/operators/sequence_mask_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_mask_op.h" + +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + return true; +} + +bool SequenceMaskOp::InferShapeImpl() const { return true; } + +bool SequenceMaskOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + if (opdesc.HasInput("MaxLenTensor") && + !opdesc.Input("MaxLenTensor").empty()) { + auto var = scope->FindVar(opdesc.Input("MaxLenTensor").front()); + if (var != nullptr) { + param_.MaxLenTensor = var->GetMutable(); + } + } + param_.Y = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + param_.maxlen = opdesc.GetAttr("maxlen"); + param_.out_dtype = opdesc.GetAttr("out_dtype"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_mask, paddle::lite::operators::SequenceMaskOp); diff --git a/lite/operators/sequence_mask_op.h b/lite/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97008b865b850f3837fcc49befc5735987fb2048 --- /dev/null +++ b/lite/operators/sequence_mask_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
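SequenceMaskOp above only validates X/Y and forwards the maxlen and out_dtype attributes; the mask itself is produced by the kernels. A reference sketch of the conventional semantics, assuming the usual rule that maxlen < 1 means "pad to the longest length found in X" (an assumption, since the kernel is not in this hunk):

#include <algorithm>
#include <cstdint>
#include <vector>

// For each length X[i], emit a row of maxlen entries: 1 while j < X[i], else 0.
std::vector<std::vector<int64_t>> SequenceMask(const std::vector<int64_t>& lengths,
                                               int maxlen) {
  if (maxlen < 1) {
    int64_t longest = 0;
    for (int64_t v : lengths) longest = std::max(longest, v);
    maxlen = static_cast<int>(longest);
  }
  std::vector<std::vector<int64_t>> y(lengths.size(),
                                      std::vector<int64_t>(maxlen, 0));
  for (size_t i = 0; i < lengths.size(); ++i) {
    for (int j = 0; j < maxlen && j < lengths[i]; ++j) y[i][j] = 1;
  }
  return y;
}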
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SequenceMaskOp : public OpLite {
+ public:
+  SequenceMaskOp() {}
+  explicit SequenceMaskOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "sequence_mask"; }
+
+ private:
+  mutable SequenceMaskParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/sequence_pad_op.cc b/lite/operators/sequence_pad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..858c0ffcbb1a8e739cf4575e9f2f8882fd231912
--- /dev/null
+++ b/lite/operators/sequence_pad_op.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/sequence_pad_op.h"
+#include
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SequencePadOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.PadValue);
+  CHECK_OR_FALSE(param_.Out);
+  CHECK_OR_FALSE(param_.Length);
+
+  return true;
+}
+
+bool SequencePadOp::InferShapeImpl() const {
+  auto x_dims = param_.X->dims();
+  CHECK_GE(x_dims.size(), 2) << "The rank of SequencePad OP Input(x) can't be "
+                                "less than 2. But the rank we received is "
+                             << x_dims.size();
+  auto time_step_dims = x_dims.Slice(1, x_dims.size());
+  auto pad_value_dims = param_.PadValue->dims();
+  CHECK_EQ((pad_value_dims == DDim({1})) || (pad_value_dims == time_step_dims),
+           true)
+      << "The SequencePad OP Input(PadValue) must be a scalar or a tensor "
+         "whose shape equals to time steps in sequences";
+
+  auto x_lod = param_.X->lod();
+  CHECK_EQ(x_lod.empty(), false)
+      << "The SequencePad OP Input(X) must hold lod info.";
+  const auto &x_lod_0 = x_lod[0];
+  CHECK_GE(x_lod_0.size(), 2)
+      << "The size of SequencePadOp Input(X)'s lod info can't be less than 2. "
+         "But the size we received is "
+      << x_lod_0.size();
+  CHECK_EQ(x_dims[0], static_cast<int64_t>(x_lod_0.back()))
+      << "The SequencePadOp Input(X)'s lod info mismatches the actual tensor "
+         "shape. The 1st dimension of Input(X)'s lod info is "
+      << x_dims[0] << ", the 1st dimension of actual tensor shape is "
+      << static_cast<int64_t>(x_lod_0.back());
+
+  int seq_num = x_lod_0.size() - 1;
+  int max_seq_len = 0;
+  for (int i = 0; i < seq_num; ++i) {
+    max_seq_len =
+        std::max(max_seq_len, static_cast<int>(x_lod_0[i + 1] - x_lod_0[i]));
+  }
+  int real_padded_length = param_.padded_length;
+  if (real_padded_length == -1) {
+    real_padded_length = max_seq_len;
+  }
+  CHECK_GE(real_padded_length, max_seq_len)
+      << "The SequencePadOp Attr(padded_length) should be greater than or "
+         "equal to the length of the longest original sequence. But the "
+         "padded_length we received is "
+      << real_padded_length
+      << ", the length of the longest original sequence is " << max_seq_len;
+
+  int out_dim_0 = seq_num;
+  std::vector<int64_t> out_dims_vec{out_dim_0, real_padded_length};
+  std::vector<int64_t> len_dims_vec{out_dim_0};
+  auto time_step_dims_vec = time_step_dims.Vectorize();
+  out_dims_vec.insert(
+      out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end());
+  param_.Out->Resize(out_dims_vec);
+  param_.Length->Resize(len_dims_vec);
+  return true;
+}
+
+bool SequencePadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  param_.X = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
+  param_.PadValue = const_cast<lite::Tensor *>(
+      &scope->FindVar(opdesc.Input("PadValue").front())->Get<lite::Tensor>());
+  param_.Length = scope->FindVar(opdesc.Output("Length").front())
+                      ->GetMutable<lite::Tensor>();
+  param_.Out =
+      scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
+  param_.padded_length = opdesc.GetAttr<int>("padded_length");
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(sequence_pad, paddle::lite::operators::SequencePadOp);
diff --git a/lite/operators/sequence_pad_op.h b/lite/operators/sequence_pad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd5d732a5d8816d4f7994ee0e3175ac8a032b2d4
--- /dev/null
+++ b/lite/operators/sequence_pad_op.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequencePadOp : public OpLite { + public: + SequencePadOp() {} + explicit SequencePadOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_pad"; } + + private: + mutable SequencePadParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc index 19a47cac9da666269fc5ef2a172ff0295b71e95d..fa2b0553aa2ac84f27d5d27d31df5ce9584d82c3 100644 --- a/lite/operators/sequence_reverse_op.cc +++ b/lite/operators/sequence_reverse_op.cc @@ -34,6 +34,7 @@ bool SequenceReverseOp::InferShapeImpl() const { const auto *input = param_.X; auto out_dims = input->dims(); param_.Out->Resize(out_dims); + param_.Out->set_lod(param_.X->lod()); return true; } @@ -45,6 +46,7 @@ bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, scope->FindVar(opdesc.Output("Y").front())->GetMutable(); CHECK(param_.X); CHECK(param_.Out); + return true; } diff --git a/lite/operators/sequence_unpad_op.cc b/lite/operators/sequence_unpad_op.cc index b91d43c741f002b2bdb30e161688cd40b462faee..4f4497f0b81b5710e71cd0a2fcce10e9559d9d30 100644 --- a/lite/operators/sequence_unpad_op.cc +++ b/lite/operators/sequence_unpad_op.cc @@ -32,32 +32,7 @@ bool SequenceUnpadOp::CheckShape() const { return true; } -bool SequenceUnpadOp::InferShapeImpl() const { - auto x_dims = param_.X->dims(); - auto len_dims = param_.Length->dims(); - - auto *seq_len_ptr = param_.Length->data(); - int64_t batch_size = len_dims[0]; - std::vector out_lod0(batch_size + 1, 0); - for (int64_t i = 0; i < batch_size; ++i) { - out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; - } - paddle::lite::LoD out_lod; - out_lod.push_back(out_lod0); - - int64_t out_dim0 = out_lod0.back(); - std::vector out_dims{out_dim0}; - if (x_dims.size() == 2) { - out_dims.push_back(1); - } else { - for (size_t i = 2; i < x_dims.size(); ++i) { - out_dims.push_back(x_dims[i]); - } - } - param_.Out->Resize(out_dims); - param_.Out->set_lod(out_lod); - return true; -} +bool SequenceUnpadOp::InferShapeImpl() const { return true; } bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc index 9ac07e96334eda9f0001d33e0789f9de15c4ca67..fec5a0e3254328220508f28a16b110beb01fb613 100644 --- a/lite/operators/subgraph_op.cc +++ b/lite/operators/subgraph_op.cc @@ -39,10 +39,11 @@ bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { op_desc.GetAttr>("input_data_names"); param_.output_data_names = op_desc.GetAttr>("output_data_names"); - CHECK(param_.sub_block_desc); - param_.sub_block_idx = op_desc.GetAttr("sub_block"); - param_.scope = scope; - CHECK(param_.scope); + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h index edbfb922044d60165e589d389cd8cfb3b2547796..df6448f2f78a08f41ac037a13d14cbca1725cfb5 
100644 --- a/lite/operators/subgraph_op.h +++ b/lite/operators/subgraph_op.h @@ -13,14 +13,11 @@ // limitations under the License. #pragma once - +#include #include #include -#include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" #include "lite/utils/all.h" namespace paddle { @@ -37,14 +34,18 @@ class SubgraphOp : public OpLite { bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "subgraph"; } - void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; } - cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable SubgraphParam param_; diff --git a/lite/operators/topk_pooling_op.cc b/lite/operators/topk_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76634d216a8a120f4e83dfe511089c6deb750cba --- /dev/null +++ b/lite/operators/topk_pooling_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/topk_pooling_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool TopkPoolingOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool TopkPoolingOp::InferShapeImpl() const { + auto out_dims = param_.X->dims(); + out_dims[1] *= param_.top_k; + auto out = param_.Out; + out->Resize(out_dims); + out->set_lod(param_.X->lod()); + + return true; +} + +bool TopkPoolingOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto y = op_desc.Input("Y").front(); + param_.X = scope->FindTensor(x); + param_.Y = scope->FindTensor(y); + auto output = op_desc.Output("Out").front(); + param_.Out = scope->FindMutableTensor(output); + param_.top_k = op_desc.GetAttr("top_k"); + param_.feat_map_num = op_desc.GetAttr("feat_map_num"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(topk_pooling, paddle::lite::operators::TopkPoolingOp); diff --git a/lite/operators/topk_pooling_op.h b/lite/operators/topk_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec48c476ca3e6854038bed591ca59402eda93736 --- /dev/null +++ b/lite/operators/topk_pooling_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class TopkPoolingOp : public OpLite { + public: + TopkPoolingOp() {} + explicit TopkPoolingOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "topk_pooling"; } + + private: + mutable TopkPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/transpose_op.cc b/lite/operators/transpose_op.cc index fe40bf6fa2f84ce7c999b41435aed00cd6555887..8f1372a883a1cd54ac2368f1e7f5e30a60a6b1db 100644 --- a/lite/operators/transpose_op.cc +++ b/lite/operators/transpose_op.cc @@ -43,24 +43,9 @@ bool TransposeOp::CheckShape() const { } bool TransposeOp::InferShapeImpl() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); auto x_dims = param_.x->dims(); - auto x_rank = x_dims.size(); std::vector axis = param_.axis; size_t axis_size = axis.size(); - // "The input tensor's rank(%d) should be equal to the axis's size(%d)", - // x_rank, axis_size - CHECK_OR_FALSE(x_rank == axis_size); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - // Each element of Attribute axis should be a unique value - // range from 0 to (dims - 1), - // where the dims is the axis's size - CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && - ++count[axis[i]] == 1); - } lite::DDim out_dims(x_dims); for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; @@ -113,24 +98,9 @@ bool Transpose2Op::CheckShape() const { } bool Transpose2Op::InferShapeImpl() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); auto x_dims = param_.x->dims(); - auto x_rank = x_dims.size(); std::vector axis = param_.axis; size_t axis_size = axis.size(); - // "The input tensor's rank(%d) should be equal to the axis's size(%d)", - // x_rank, axis_size - CHECK_OR_FALSE(x_rank == axis_size); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - // Each element of Attribute axis should be a unique value - // range from 0 to (dims - 1), - // where the dims is the axis's size - CHECK_OR_FALSE(axis[i] < static_cast(axis_size) && - ++count[axis[i]] == 1); - } lite::DDim out_dims(x_dims); for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc index 8cf11f6465d73646ec9bf846cbe6347bdc4b9f5b..612632acb4fbea692aa4a02dbd94bb1b506460bb 100644 --- a/lite/operators/var_conv_2d_op.cc +++ b/lite/operators/var_conv_2d_op.cc @@ -26,10 +26,16 @@ bool VarConv2dOp::InferShapeImpl() const { 
return true; } bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.X = const_cast( &scope->FindVar(opdesc.Input("X").front())->Get()); - // param_.ROW = const_cast( - // &scope->FindVar(opdesc.Input("ROW").front())->Get()); - // param_.COLUMN = const_cast( - // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + if (opdesc.HasInput("ROW") && !opdesc.Input("ROW").empty()) { + param_.ROW = const_cast( + &scope->FindVar(opdesc.Input("ROW").front())->Get()); + CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + } + if (opdesc.HasInput("COLUMN") && !opdesc.Input("COLUMN").empty()) { + param_.COLUMN = const_cast( + &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + } param_.W = const_cast( &scope->FindVar(opdesc.Input("W").front())->Get()); param_.Out = @@ -37,8 +43,6 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.Col = scope->FindVar(opdesc.Output("Col").front())->GetMutable(); CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; - // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; - // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; CHECK(param_.W) << "W(Input) of VarConv2dOP should not be null."; CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; @@ -52,6 +56,15 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasAttr("fuse_relu")) { param_.fuse_relu = opdesc.GetAttr("fuse_relu"); } +#ifdef LITE_WITH_XPU + if (opdesc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = opdesc.GetAttr("__xpu__float_to_fix"); + } + if (opdesc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = opdesc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/where_index_op.cc b/lite/operators/where_index_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..81443b7058e0c7d68008cbe98040b3f50eac852f --- /dev/null +++ b/lite/operators/where_index_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/where_index_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool WhereIndexdOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_GE(param_.input->dims().size(), 1);
+  return true;
+}
+
+bool WhereIndexdOp::InferShapeImpl() const {
+  int64_t rank = static_cast<int64_t>(param_.input->dims().size());
+  int64_t numel = static_cast<int64_t>(param_.input->dims().production());
+  param_.output->Resize({numel, rank});
+  return true;
+}
+
+bool WhereIndexdOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  AttachParam(&param_);
+  auto input = opdesc.Input("Condition").front();
+  auto output = opdesc.Output("Out").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+  param_.input = GetVar<lite::Tensor>(scope, input);
+  param_.output = GetMutableVar<lite::Tensor>(scope, output);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(where_index, paddle::lite::operators::WhereIndexdOp);
diff --git a/lite/operators/where_index_op.h b/lite/operators/where_index_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..157a3cb0be33ffad275ae55a0999095357a09948
--- /dev/null
+++ b/lite/operators/where_index_op.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class WhereIndexdOp : public OpLite {
+ public:
+  WhereIndexdOp() {}
+  explicit WhereIndexdOp(const std::string &op_type) : OpLite(op_type) {}
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "where_index_op"; }
+
+ private:
+  mutable WhereIndexParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/while_op.cc b/lite/operators/while_op.cc
index 1dcf9553f331ee6646ad6d93de048728a0886116..ab8e4a5489c13e042bf0d07da1228f33626a1d43 100644
--- a/lite/operators/while_op.cc
+++ b/lite/operators/while_op.cc
@@ -20,31 +20,23 @@ namespace paddle {
 namespace lite {
 namespace operators {
 
-bool WhileOpLite::CheckShape() const {
-  CHECK_OR_FALSE(param_.sub_block);
-  CHECK_OR_FALSE(param_.scope);
+bool WhileOp::CheckShape() const {
   CHECK_OR_FALSE(param_.cond);
+  CHECK_OR_FALSE(param_.program_desc);
+  CHECK_OR_FALSE(param_.exec_scope);
   return true;
 }
 
-bool WhileOpLite::InferShapeImpl() const { return true; }
-
-bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
-  auto inputs = op_desc.Input("X");
-  auto outs = op_desc.Output("Out");
-
-  for (auto var : inputs) {
-    // param_.x.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
-  }
-  for (auto var : outs) {
-    // param_.outs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
-  }
-  param_.sub_block = sub_block_;
+bool WhileOp::InferShapeImpl() const { return true; }
 
+bool WhileOp::AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) {
   auto condition = op_desc.Input("Condition");
   param_.cond = scope->FindVar(condition[0])->GetMutable<lite::Tensor>();
-  param_.scope = scope;
-
+  CHECK(param_.program_desc);
+  param_.block_idx = op_desc.GetAttr<int32_t>("sub_block");
+  CHECK_GE(param_.block_idx, 0);
+  param_.exec_scope = scope;
+  CHECK(param_.exec_scope);
   return true;
 }
 
@@ -52,4 +44,4 @@ bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_OP(while, paddle::lite::operators::WhileOpLite);
+REGISTER_LITE_OP(while, paddle::lite::operators::WhileOp);
diff --git a/lite/operators/while_op.h b/lite/operators/while_op.h
index 94aec15a6d3eb60036bf9c2168fdbd855b84a396..e448ee568723b24a241c5bb127ac61458385337e 100644
--- a/lite/operators/while_op.h
+++ b/lite/operators/while_op.h
@@ -13,6 +13,7 @@
 // limitations under the License.
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,24 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class WhileOpLite : public OpLite { +class WhileOp : public OpLite { public: - WhileOpLite() {} - explicit WhileOpLite(const std::string &op_type) : OpLite(op_type) {} + WhileOp() {} + explicit WhileOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "while"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable WhileParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt index 810a20abbc0d13897822cef2c99e5942e352a19f..e9c6574c19bcb6a238503d7b5fc955db9b96d689 100644 --- a/lite/tests/api/CMakeLists.txt +++ b/lite/tests/api/CMakeLists.txt @@ -1,3 +1,13 @@ +if(LITE_WITH_ARM) + lite_cc_test(test_transformer_with_mask_fp32_arm SRCS test_transformer_with_mask_fp32_arm.cc + DEPS ${lite_model_test_DEPS} paddle_api_full + ARM_DEPS ${arm_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/transformer_with_mask_fp32 SERIAL) + if(WITH_TESTING) + add_dependencies(test_transformer_with_mask_fp32_arm extern_lite_download_transformer_with_mask_fp32_tar_gz) + endif() +endif() + if(LITE_WITH_XPU) lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils @@ -6,11 +16,25 @@ if(LITE_WITH_XPU) lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/ernie) lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/bert) + if(WITH_TESTING) + add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz) + add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz) + add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz) + endif() + # TODO(miaotianxiang): enable later + #lite_cc_test(test_fpr_lite_xpu SRCS test_fpr_lite_xpu.cc + #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + #lite_cc_test(test_mmdnn_lite_xpu SRCS test_mmdnn_lite_xpu.cc + #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels} + #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) endif() if(LITE_WITH_RKNPU) diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc index b3ee9febb3f0eabd36118680beca66ace9470de4..5d66fd0d5496e105ba97bea6c5e5387d96c9e01b 100644 --- 
a/lite/tests/api/test_bert_lite_xpu.cc +++ b/lite/tests/api/test_bert_lite_xpu.cc @@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) { for (size_t i = 0; i < results.size(); ++i) { for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR( - out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + out->data()[j + (out->shape()[1] * i)], results[i][j], 3e-5); } } } diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc index 0b614fec96cbcc5d9c96653681d0e8794cf4ab8f..b1db9f353657f3f09bcad25db4e777b05f15e0f7 100644 --- a/lite/tests/api/test_ernie_lite_xpu.cc +++ b/lite/tests/api/test_ernie_lite_xpu.cc @@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) { for (size_t i = 0; i < results.size(); ++i) { for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR( - out->data()[j + (out->shape()[1] * i)], results[i][j], 1e-5); + out->data()[j + (out->shape()[1] * i)], results[i][j], 2e-5); } } } diff --git a/lite/tests/api/test_fpr_lite_xpu.cc b/lite/tests/api/test_fpr_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..026c25690fe2a673be0a5a97b163d7bbe5fdb4f6 --- /dev/null +++ b/lite/tests/api/test_fpr_lite_xpu.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +TEST(ResnetCbam, test_resnet_cbam_lite_xpu) { + lite_api::CxxConfig config; + // config.set_model_dir(FLAGS_model_dir); + config.set_model_file(FLAGS_model_dir + "/__model__"); + config.set_param_file(FLAGS_model_dir + "/__params__"); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + std::vector input_shape{1, 3, 224, 224}; + input_tensor->Resize(input_shape); + auto* data = input_tensor->mutable_data(); + int input_num = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + for (int i = 0; i < input_num; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_mmdnn_lite_xpu.cc b/lite/tests/api/test_mmdnn_lite_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..72d774db14d955f17caee217f13fddb32acb93c3 --- /dev/null +++ b/lite/tests/api/test_mmdnn_lite_xpu.cc @@ -0,0 +1,299 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +DEFINE_bool(perf, false, "perf?"); +DEFINE_string(perf_input, "perf_input", "perf_input"); +DEFINE_int32(perf_batch_size, 40, "perf_batch_size"); +DEFINE_bool(use_xpu, true, "use_xpu?"); +DEFINE_int32(perf_dev, 0, "perf_dev"); + +namespace paddle { +namespace lite { + +class SampleReader { + public: + std::vector> data; + std::vector> lod; + + void Read() { + std::string raw_input = + "0 1;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 142 " + "2114 197 10 2899;125 756226 756913 855693 760836 10 750793;125 584 " + "142 2114 197 10 2899 2 825 32 18499 125 584 295 2114 197 2114 2730 6 " + "15 32 18499 125 584 142 295 2114 1423 21 2 334 863 5122 197 974 21 " + "295 619 25 2114 1755 2701 197 15 216 23 18499 125 584 142 599 3228 23 " + "2 5122 1917 804 5 2114 197 1236 3 2114 1403 15 3886 1080 23 1150 125 " + "475 23 2998 23;125 756226 756913 855693 760836 10 750793 2 825 750355 " + "18499 881680 756226 295 765124 760836 2114 872813 754265 15 32 18499 " + "881680 756226 756913 761251 765124 752843 766823 2 334 759834 5122 " + "774643 758458 21 295 755114 25 1148365 1755 2701 197 15 216 23 18499 " + "881680 756226 756913 826848 3228 23 2 5122 831009 804 752371 2114 " + "760836 1236 3 2114 910393 15 3886 1080 23 877375 752137 761034 792123 " + "2998 23;1;1;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;121 28 1054 " + "1459 125 72 32 2321 531 125 295 584 142 2114 197 14 477 30 121;121 28 " + "764114 1459 753052 750694 750001 886192 750435 752179 295 584 756913 " + "855693 760836 14 477 30 753504;121 28 1054 1459 125 72 32 2321 531 " + "125 295 584 142 2114 197 2 121 28 1054 1459 125 72 32 2321 531 125 " + "295 584 142 4 263 2114 197 43 95 863 2114 323 20 142 626 11 2 45 10 " + "45 58 142 65 918 741 2114 197 764 3 5122 26 51 1266 2037 295 222 1121 " + "4491 3 545 4338 11 2 5122 26 495 3 142 3444 3249 2114 197 3 626 4 " + "2794;121 28 764114 1459 753052 750694 750001 886192 750435 752179 295 " + "584 756913 855693 760836 2 121 28 764114 1459 753052 750694 750001 " + "886192 750435 752179 295 584 756913 4 750885 2114 760836 43 750030 " + "754302 2114 323 822131 142 626 769001 2 45 750128 750324 58 142 " + "1147454 918 910829 2114 760836 841946 767340 5122 779102 51 1266 2037 " + "756461 222 752031 942669 1139389 780275 4338 830597 2 5122 779102 495 " + "761418 142 3444 852932 2114 760836 3 760162 757966 751127;121 295 " + "5593 142 2114 197;121 295 5593 925208 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 125 584 " + "142 2114 1423 14 5283 1745 73;207 752276 756226 756913 855693 752843 " + "14 5283 781651 786597;6109 18807 142 5 64 5283 1745 73 3690 1060 3626 " + "4 716 51 1030 2114 197 4 428 936 9066 10 10 10 2 207 125 584 142 2114 " + "1423 2 15329 2114 197 5669 401 318 285 953 4 2114 197 2285 7 1783 11 " + "2 5122 197 14017 584;6109 18807 142 5 755319 5283 781651 786597 3690 " + "1060 3626 4 716 910478 1030 2114 760836 4 750323 936 9066 10 750002 " + "750002 2 207 752276 756226 756913 855693 752843 2 15329 2114 760836 " + "5669 401 318 757541 750261 4 2114 760836 2285 7 757639 11 2 5122 " + "774643 14017 584;125 584 142 1745 5122;125 756226 756913 1745 " + "755836;\n" + "0 0;125 584 142 2114 197;125 
756226 756913 855693 760836;149 396 778 " + "584 142 295 2114 1423 14 64 125 584 73 21 36670 5834 10 211 25;149 " + "751876 1048872 584 756913 761251 765124 752843 14 64 125 756226 73 " + "944567 36670 5834 10 750012 753240;101 10 2114 197 3 946 2 149 396 " + "778 584 142 295 2114 1423 2 2610 6 1444 111 2114 948 72 32 21 15 494 " + "25 4 2114 197 5669 1145 2 148 295 149 396 778 584 142 295 21 22853 41 " + "348 619 25 366 5305 2114 807 4 1115 381 1955 2114 11;101 751178 2114 " + "760836 3 946 2 149 751876 1048872 584 756913 761251 765124 752843 2 " + "2610 753567 775165 750899 972788 948 750125 750001 751875 15 494 25 4 " + "2114 760836 5669 1145 2 148 808886 982157 751876 1048872 584 756913 " + "761251 790772 22853 41 348 619 25 366 894206 2114 1008440 4 753953 " + "381 851474 765868 11;149 396 778 584 142 295 2 149 396 354 778 584 " + "142 1333 2 584 778 295 5122 2 149 396 778 584 3609 2 149 396 64478 " + "816 14246 1423 2 149 396 584 32 127 19 3609 2 149 396 584 73 2 149 " + "396 584 778 295 2285 142 4922 323 2 149 396 584 2114 2 149 396 253 " + "584 2114 197;149 751876 1048872 584 756913 761251 2 149 751876 756286 " + "767182 584 756913 1333 2 584 778 897778 941364 2 149 751876 1048872 " + "584 1102835 2 149 751876 64478 816 14246 912094 2 149 751876 584 " + "773547 127 750771 791456 2 149 751876 584 73 2 149 751876 584 778 " + "897778 2285 751493 791984 323 2 149 751876 584 2114 2 149 751876 " + "808443 835481 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 545 " + "149 14 125 584;125 756226 545 874302 14 125 756226;2204 25 30 1692 " + "1770 6534 295 125 584 72 32 1346 4 2698 2114 197 11 2 4235 4301 240 " + "295 125 584 72 32 21 6708 15 56974 494 25 1030 2114 197 110 804 495 " + "611 2 221 759 341 6 5283 1745 73 71 2114 1423 71 125 584 545 149 149 " + "2 505 345 58 125 584 65 3486 2114 295 4 45 786 196 6604 6086;2204 25 " + "30 797189 1770 1191824 295 752782 756226 751697 750001 1346 4 2698 " + "2114 760836 765158 2 4235 4301 240 753859 752782 756226 751697 750001 " + "751875 6708 15 56974 494 25 1030 2114 760836 777607 762850 966521 611 " + "2 221 752565 750130 750084 910219 781651 786597 71 2114 752843 71 125 " + "756226 545 874302 149 2 505 825657 782848 125 756226 65 3486 2114 " + "760669 4 45 755747 758903 6604 6086;125 584 2114 2 125 584 2114 1423 " + "2 125 584 2114 149 2 149 584 1745 5122 725 2 2114 125 584 2 125 584 " + "2114 2 2621 584 2114 2 527 37 2754 130 170 1013 494 887 240 2 4521 " + "11111 586 2321 531 125 584 142 1360 816 2842 1423 2 125 584 2114;125 " + "756226 2114 2 125 756226 2114 752843 2 125 756226 2114 783644 2 149 " + "760183 1745 755836 725 2 2114 125 756226 2 125 756226 2114 2 2621 " + "932600 2114 2 527 751304 869964 754462 170 1013 750719 778287 774620 " + "2 4521 11111 586 2321 750435 752179 756226 756913 1360 764399 2842 " + "1423 2 125 756226 2114;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 584 142 " + "2114 197 4 207 584 142 2114 197 674 14 240 4328 14 4328 767;207 " + "1237071 756913 855693 760836 4 207 1237071 756913 855693 760836 674 " + "14 240 755573 14 4328 795065;207 584 142 2114 197 2 325 71 71 207 584 " + "142 2114 197 2 876 125 140 2114 197 2 207 584 142 2114 197 674 1210 " + "239 4328 767 268 1349 485 28 4389 504 3 941 57 1419 1978 11;207 " + "1237071 756913 855693 760836 2 325 71 71 207 1237071 756913 855693 " + "760836 2 876 125 750977 1250790 760836 2 207 1237071 756913 855693 " + "760836 674 814792 755820 812174 795065 818859 817155 816597 761001 " + "774461 780904 820475 
1109800 790141 790459 780324 770390;584 142 295 " + "2114 232 2 207 584 2114 197 2 584 142 295 2114 232 2 584 142 512 2114 " + "197;584 756913 761251 765124 1006359 2 207 1237071 2114 760836 2 584 " + "756913 761251 765124 1006359 2 584 756913 879930 2114 760836;"; + + auto lines = Split(raw_input, "\n"); + for (auto& line : lines) { + auto split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } + } + + for (size_t i = 1; i < split1.size(); ++i) { + auto split2 = Split(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (auto e : split2) { + data[i - 1].push_back(std::stoi(e.c_str(), nullptr, 0)); + } + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + } + } +}; + +class FileReader { + std::ifstream ifs; + + public: + std::vector> data; + std::vector> lod; + + void Init(std::string file_name) { ifs.open(file_name); } + + int Read(int maxline) { + data.clear(); + lod.clear(); + + std::string line; + int cnt = 0; + while (cnt < maxline && getline(ifs, line)) { + std::vector split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } + } + + for (size_t i = 1; i < split1.size(); i++) { + std::vector split2 = Split(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (size_t j = 0; j < split2.size(); j++) { + data[i - 1].push_back(std::stoi(split2[j].c_str(), nullptr, 0)); + } + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + cnt++; + } + return cnt; + } +}; + +TEST(MMDNN, test_mmdnn_lite_xpu) { + lite_api::CxxConfig config; + // config.set_model_dir(FLAGS_model_dir); + config.set_model_file(FLAGS_model_dir + "/__model__"); + config.set_param_file(FLAGS_model_dir + "/__param__"); + config.set_xpu_dev_per_thread(FLAGS_perf_dev); + if (FLAGS_use_xpu) { + config.set_valid_places( + {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } else { + config.set_valid_places( + {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + if (FLAGS_perf) { + FileReader file_reader; + file_reader.Init(FLAGS_perf_input); + int UB_batch = FLAGS_perf_batch_size; // upper bound of batch + int iter = 0; + double tsc_sum = 0; + + while (true) { + int batch = file_reader.Read(UB_batch); + if (batch <= 0) { + break; + } + ++iter; + for (size_t i = 0; i < file_reader.data.size(); ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)file_reader.data[i].size(), 1}); + input_x->SetLoD({file_reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + file_reader.data[i].data(), + file_reader.data[i].size() * sizeof(int64_t)); + } + + auto start = GetCurrentUS(); + predictor->Run(); + auto end = GetCurrentUS(); + tsc_sum += end - start; + } + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " + << FLAGS_threads << ", warmup: " << FLAGS_warmup + << ", repeats: " << iter << ", spend " << 
tsc_sum / iter / 1000.0 + << " ms in average."; + + return; + } + + SampleReader sample_reader; + sample_reader.Read(); + + for (size_t i = 0; i < sample_reader.data.size(); ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)sample_reader.data[i].size(), 1}); + input_x->SetLoD({sample_reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + sample_reader.data[i].data(), + sample_reader.data[i].size() * sizeof(int64_t)); + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + auto out = predictor->GetOutput(0); + auto out_shape = out->shape(); + auto out_size = std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies()); + for (int i = 0; i < out_size; ++i) { + LOG(INFO) << "out[" << i << "] = " << out->data()[i]; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/api/test_transformer_with_mask_fp32_arm.cc b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e65b017aa1440683d86d0da03686a2be9c4c6ee5 --- /dev/null +++ b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc @@ -0,0 +1,274 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +template +void SetTensorData(const std::vector &data, + const std::vector &shape, + paddle::lite_api::Tensor *tensor, + const std::vector> &lod = {}) { + tensor->Resize(shape); + tensor->SetLoD(lod); + std::copy(data.begin(), data.end(), tensor->mutable_data()); +} + +void PrepareInputData( + const std::shared_ptr &predictor, + std::vector src_word_data, + int max_seq_len = 16, // padding + int max_out_len = 8, + int bos_idx = 0, + int eos_idx = 1, + int n_head = 8) { + // src_word + auto src_word = predictor->GetInput(0); + int seq_len = src_word_data.size(); + for (int i = seq_len; i < max_seq_len; i++) { + src_word_data.push_back(eos_idx); + } + std::vector src_word_shape{ + 1, static_cast(src_word_data.size())}; + SetTensorData(src_word_data, src_word_shape, src_word.get()); + // src_pos + auto src_pos = predictor->GetInput(1); + std::vector src_pos_data(src_word_data.size()); + std::iota(src_pos_data.begin(), src_pos_data.end(), 0); + std::vector src_pos_shape{1, + static_cast(src_pos_data.size())}; + SetTensorData(src_pos_data, src_pos_shape, src_pos.get()); + // src_slf_attn_bias + auto src_slf_attn_bias = predictor->GetInput(2); + std::vector src_slf_attn_bias_data(1 * n_head * src_word_data.size() * + src_word_data.size()); + int offset = 0; + for (int j = 0; j < 1 * n_head * src_word_data.size(); j++) { + for (int i = 0; i < seq_len; i++) { + src_slf_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + src_slf_attn_bias_data[offset++] = -1e9f; + } + } + std::vector src_slf_attn_bias_shape{ + 1, + n_head, + static_cast(src_word_data.size()), + static_cast(src_word_data.size())}; + SetTensorData( + src_slf_attn_bias_data, src_slf_attn_bias_shape, src_slf_attn_bias.get()); + // trg_word + auto trg_word = predictor->GetInput(3); + std::vector trg_word_data(2, 0); + std::vector trg_word_shape{2, 1}; + std::vector lod_level_0{0, 2}; + std::vector lod_level_1{0, 1, 2}; + std::vector> trg_word_lod(2); + trg_word_lod[0] = lod_level_0; + trg_word_lod[1] = lod_level_1; + SetTensorData( + trg_word_data, trg_word_shape, trg_word.get(), trg_word_lod); + // init_score + auto init_score = predictor->GetInput(4); + std::vector init_score_data(2); + init_score_data[0] = 0; + init_score_data[1] = -1e9f; + std::vector init_score_shape{2, 1}; + std::vector> init_score_lod(trg_word_lod); + SetTensorData( + init_score_data, init_score_shape, init_score.get(), init_score_lod); + // init_idx + auto init_idx = predictor->GetInput(5); + std::vector init_idx_data(2, 0); + std::vector init_idx_shape{2}; + SetTensorData(init_idx_data, init_idx_shape, init_idx.get()); + // trg_slf_attn_bias + auto trg_slf_attn_bias = predictor->GetInput(6); + std::vector trg_slf_attn_bias_data(max_out_len * n_head * 1 * + max_out_len); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + trg_slf_attn_bias_data[offset++] = (i <= k) ? 
0.0f : -1e9f; + } + } + } + std::vector trg_slf_attn_bias_shape{ + max_out_len, n_head, 1, max_out_len}; + SetTensorData( + trg_slf_attn_bias_data, trg_slf_attn_bias_shape, trg_slf_attn_bias.get()); + // trg_src_attn_bias + auto trg_src_attn_bias = predictor->GetInput(7); + std::vector trg_src_attn_bias_data(1 * n_head * 1 * + src_word_data.size()); + offset = 0; + for (int j = 0; j < 1 * n_head * 1; j++) { + for (int i = 0; i < seq_len; i++) { + trg_src_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + trg_src_attn_bias_data[offset++] = -1e9f; + } + } + std::vector trg_src_attn_bias_shape{ + 1, n_head, 1, static_cast(src_word_data.size())}; + SetTensorData( + trg_src_attn_bias_data, trg_src_attn_bias_shape, trg_src_attn_bias.get()); + // kv_padding_selection + auto kv_padding_selection = predictor->GetInput(8); + std::vector kv_padding_selection_data(max_out_len * n_head * + max_out_len * 1); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + kv_padding_selection_data[offset++] = (i == k) ? 1.0f : 0.0f; + } + } + } + std::vector kv_padding_selection_shape{ + max_out_len, n_head, max_out_len, 1}; + SetTensorData(kv_padding_selection_data, + kv_padding_selection_shape, + kv_padding_selection.get()); +} + +void CheckOutputData( + const std::shared_ptr &predictor, + const std::vector &ref_seq_ids_data, + const std::vector &ref_seq_scores_data) { + // seq_ids + auto seq_ids = predictor->GetOutput(0); + auto seq_ids_shape = seq_ids->shape(); + auto seq_ids_size = std::accumulate(seq_ids_shape.begin(), + seq_ids_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_ids_size, ref_seq_ids_data.size()); + auto *seq_ids_data = seq_ids->data(); + for (size_t i = 0; i < seq_ids_size; i++) { + EXPECT_EQ(seq_ids_data[i], ref_seq_ids_data[i]); + } + // seq_scores + auto seq_scores = predictor->GetOutput(1); + auto seq_scores_shape = seq_scores->shape(); + auto seq_scores_size = std::accumulate(seq_scores_shape.begin(), + seq_scores_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_scores_size, ref_seq_scores_data.size()); + auto *seq_scores_data = seq_scores->data(); + for (size_t i = 0; i < seq_scores_size; i++) { + EXPECT_NEAR(seq_scores_data[i], ref_seq_scores_data[i], 1e-5); + } +} + +TEST(TransformerWithMask, test_transformer_with_mask_fp32) { + // Save the optimized model by using full api with CxxConfig + lite_api::CxxConfig cxx_config; + cxx_config.set_model_dir(FLAGS_model_dir); + cxx_config.set_valid_places( + {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kInt64)}}); + auto predictor = lite_api::CreatePaddlePredictor(cxx_config); + predictor->SaveOptimizedModel(FLAGS_model_dir + ".nb", + paddle::lite_api::LiteModelType::kNaiveBuffer); + // Load the optimized model and run inference by using light api with + // MobileConfig + paddle::lite_api::MobileConfig mobile_config; + mobile_config.set_model_from_file(FLAGS_model_dir + ".nb"); + mobile_config.set_threads(1); + mobile_config.set_power_mode(paddle::lite_api::PowerMode::LITE_POWER_HIGH); + std::vector, + std::pair, std::vector>>> + test_cases = { + {{16, 16, 16, 1}, + {{0, 16, 16, 16, 16, 16, 16, 1, 0, 16, 16, 16, 16, 16, 9, 1}, + {0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -7.45856f, + -7.58065f, + 0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -8.70994f, + -8.8053f}}}, + {{16, 16, 16, 10, 1}, + {{0, 
6, 53, 11, 1, 0, 6, 53, 56, 4, 1}, + {0.0f, + -2.36122f, + -4.1678f, + -6.19764f, + -7.69256f, + 0.0f, + -2.36122f, + -4.1678f, + -6.20145f, + -7.66355f, + -8.63024f}}}, + {{126, 4, 33, 1}, + {{0, 68, 5, 17, 1, 0, 68, 5, 13, 14, 1}, + {0.0f, + -0.829941f, + -1.20217f, + -2.23938f, + -2.98262f, + 0.0f, + -0.829941f, + -1.20217f, + -2.25051f, + -3.07555f, + -3.57711f}}}, + {{126, 4, 33, 99, 1}, + {{0, 14, 242, 17, 1, 0, 93, 38, 27, 68, 1}, + {0.f, + -1.8504f, + -2.66679f, + -3.09469f, + -3.63227f, + 0.0f, + -1.33829f, + -1.41656f, + -3.1333f, + -3.27901f, + -3.88582f}}}}; + for (auto &test_case : test_cases) { + PrepareInputData(predictor, test_case.first); + predictor->Run(); + CheckOutputData(predictor, test_case.second.first, test_case.second.second); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 03f0de291e80d821af5704727dbd30b10d2ca453..b8d142d7f5cc322b5950ebd512f6e60cd40f247a 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,89 +1,91 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework 
${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_group_norm_compute SRCS group_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} 
${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) - lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() endif() - lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_interp_compute SRCS interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_is_empty_compute SRCS is_empty_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_crf_decoding_compute SRCS crf_decoding_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_interp_compute SRCS interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_is_empty_compute SRCS is_empty_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS 
arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_crf_decoding_compute SRCS crf_decoding_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index c71eac8d4532eefd5569421807c85128746c6c8b..0e803f1281fe2fc4dfca70c3f5223b8835ad7eff 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -38,7 +38,8 @@ enum activation_type_test { GELU, SQUARE, HARD_SWISH, - RECIPROCAL + RECIPROCAL, + THRESHOLDED_RELU }; class ActivationComputeTester : public arena::TestCase { @@ -54,6 +55,7 @@ class ActivationComputeTester : public arena::TestCase { float hard_swish_threshold = 6.0; float hard_swish_scale = 6.0; float hard_swish_offset = 3.0; + float relu_threshold_ = 1.0; DDim dims_{{1}}; std::string type_ = ""; activation_type_test act_type_ = RELU; @@ -218,6 +220,12 @@ class ActivationComputeTester : public arena::TestCase { } break; } + case THRESHOLDED_RELU: { + for (int i = 0; i < dims_.production(); i++) { + output_data[i] = x_data[i] > relu_threshold_ ? x_data[i] : 0.f; + } + break; + } default: LOG(INFO) << "the type of activation is unknow."; } @@ -245,6 +253,9 @@ class ActivationComputeTester : public arena::TestCase { op_desc->SetAttr("scale", hard_swish_scale); op_desc->SetAttr("offset", hard_swish_offset); } + if (act_type_ == THRESHOLDED_RELU) { + op_desc->SetAttr("threshold", relu_threshold_); + } } void PrepareData() override { @@ -289,8 +300,11 @@ TEST(Activation_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -313,6 +327,9 @@ TEST(Activation_leaky_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -393,6 +410,9 @@ TEST(Activation_sigmoid, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -415,8 +435,11 @@ TEST(Activation_tanh, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -456,6 +479,9 @@ TEST(Activation_relu6, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = 
TARGET(kARM);
+#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
+  place = TARGET(kHuaweiAscendNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
 #else
   return;
 #endif
@@ -561,7 +587,7 @@ TEST(Activation_gelu, precision) {
   LOG(INFO) << "test gelu op";
   Place place;
   float abs_error = 2e-5;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
@@ -632,5 +658,35 @@ TEST(activation_reciprocal, precision) {
   }
 }
 
+TEST(Activation_thresholded_relu, precision) {
+  LOG(INFO) << "test thresholded_relu op";
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+
+  for (auto dims : std::vector<std::vector<int64_t>>{
+           {1, 3, 2, 4}, {2, 3, 4}, {5, 4}, {8}}) {
+    std::unique_ptr<arena::TestCase> tester(
+        new ActivationComputeTester(place,
+                                    "def",
+                                    0.01,
+                                    6.,
+                                    "all",
+                                    0.,
+                                    DDim(dims),
+                                    "thresholded_relu",
+                                    THRESHOLDED_RELU));
+    arena::Arena arena(std::move(tester), place, abs_error);
+    arena.TestPrecision();
+  }
+}
+
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/tests/kernels/activation_grad_compute_test.cc b/lite/tests/kernels/activation_grad_compute_test.cc
index 5d5046b01dee6c84f341159b68300197c20695e6..2ad5b80a910f323b34b039eabda0ceb4b49784c5 100644
--- a/lite/tests/kernels/activation_grad_compute_test.cc
+++ b/lite/tests/kernels/activation_grad_compute_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/activation_grad_compute.h"
+#include "lite/kernels/host/activation_grad_compute.h"
 #include <gtest/gtest.h>
 #include "lite/core/op_registry.h"
 #include "lite/kernels/arm/activation_compute.h"
@@ -20,13 +20,11 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
 
 using param_t = operators::ActivationParam;
 using grad_param_t = operators::ActivationGradParam;
-using kernel_t = SquareCompute;
-using grad_kernel_t = SquareGradCompute;
 
+template <class kernel_t, class grad_kernel_t>
 class ActivationGradTester {
  public:
   explicit ActivationGradTester(DDim dims) : dims_(dims) {}
@@ -71,22 +69,28 @@ class ActivationGradTester {
   void run_backward(grad_param_t* param,
                     grad_kernel_t* kernel,
                     const std::vector<float>& in_vec,
+                    const std::vector<float>& out_vec,
                     const std::vector<float>& out_grad_vec,
                     float* in_grad_vec) {
     Tensor x;
+    Tensor out;
     Tensor x_grad;
     Tensor out_grad;
     x.Resize(dims_);
+    out.Resize(dims_);
     x_grad.Resize(dims_);
     out_grad.Resize(dims_);
     auto* x_data = x.mutable_data<float>();
+    auto* out_data = out.mutable_data<float>();
     auto* out_grad_data = out_grad.mutable_data<float>();
     for (int i = 0; i < dims_.production(); i++) {
       x_data[i] = in_vec[i];
+      out_data[i] = out_vec[i];
       out_grad_data[i] = out_grad_vec[i];
     }
 
     param->X = &x;
+    param->Out = &out;
     param->X_grad = &x_grad;
     param->Out_grad = &out_grad;
     kernel->SetParam(*param);
@@ -102,7 +106,9 @@ class ActivationGradTester {
     std::vector<float> x(dims_.production());
     std::vector<float> out(dims_.production());
     for (int i = 0; i < dims_.production(); i++) {
-      x[i] = 1.0 * static_cast<float>(i % 128) * 0.3f - 1.1;
+      x[i] = static_cast<float>(i % 3 - 2.0) / 2.0 * 0.333 +
+             static_cast<float>(i % 19 - 10.0) / 10.0 * 0.333 +
+             static_cast<float>(i % 39 - 20.0) / 20.0 * 0.333 + 0.001213;
     }
 
     this->run_forward(&param_, &kernel_, x, out.data());
@@ -120,7 +126,8 @@ class ActivationGradTester {
     for (int i = 0; i < dims_.production(); i++) {
       out_grad[i] = 1.0;
     }
-    this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data());
+    this->run_backward(
+        &grad_param_,
&grad_kernel_, x, out, out_grad, x_grad.data()); for (int i = 0; i < dims_.production(); i++) { EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta); @@ -137,31 +144,58 @@ class ActivationGradTester { grad_param_t grad_param_; }; -void TestNormalCase(DDim dims) { - std::unique_ptr tester(new ActivationGradTester(dims)); +void TestSquareGrad(DDim dims) { + LOG(INFO) << "Test Square grad"; + std::unique_ptr< + ActivationGradTester> + tester( + new ActivationGradTester( + dims)); tester->prepare_kernel(); float delta = 0.001; float max_grad_delta = 0.005; tester->check_grad(delta, max_grad_delta); } -TEST(activation_grad_arm, compute) { - LOG(INFO) << "Test Square grad"; +void TestReluGrad(DDim dims) { + LOG(INFO) << "Test Relu grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +void TestTanhGrad(DDim dims) { + LOG(INFO) << "Test Tanh grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +TEST(activation_grad_host, compute) { DeviceInfo::Init(); - for (auto n : {2}) { - for (auto c : {2}) { - for (auto h : {2}) { - for (auto w : {2}) { - TestNormalCase(DDim(std::vector({n, c, h, w}))); + for (auto n : {2, 1}) { + for (auto c : {2, 9}) { + for (auto h : {2, 1}) { + for (auto w : {2, 10}) { + TestSquareGrad(DDim(std::vector({n, c, h, w}))); + TestReluGrad(DDim(std::vector({n, c, h, w}))); + TestTanhGrad(DDim(std::vector({n, c, h, w}))); } } } } } -} // namespace arm } // namespace kernels } // namespace lite } // namespace paddle USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(square_grad, kHost, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc index ae65e0e3c320ff153a99d2a1656227bad34428d4..9674f95d0b52dbc264ef78748d0c0fba1e4ebc37 100644 --- a/lite/tests/kernels/batch_norm_compute_test.cc +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -157,7 +157,7 @@ TEST(BatchNorm, precision) { LOG(INFO) << "test BatchNorm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/box_clip_compute_test.cc b/lite/tests/kernels/box_clip_compute_test.cc index 72947fa4b258a894e5a73c5e8fe8cce12ef9a02c..c599e64214d3fb15a52cb14fe48de7a7d75b2868 100644 --- a/lite/tests/kernels/box_clip_compute_test.cc +++ b/lite/tests/kernels/box_clip_compute_test.cc @@ -70,9 +70,7 @@ class BoxClipComputeTester : public arena::TestCase { float sign = i % 3 == 0 ? 
-1.0f : 1.0f;
       input_data[i] = sign * static_cast<float>((i * 7) % 20);
     }
-    SetCommonTensor(input_, input_dims_, input_data.data());
-    auto input_tensor = baseline_scope()->FindMutableTensor(input_);
-    input_tensor->set_lod(input_lod_);
+    SetCommonTensor(input_, input_dims_, input_data.data(), input_lod_);
 
     std::vector<float> im_info_data{10, 10, 1, 15, 15, 1};
     SetCommonTensor(im_info_, im_info_dim_, im_info_data.data());
diff --git a/lite/tests/kernels/cast_compute_test.cc b/lite/tests/kernels/cast_compute_test.cc
index 86331bb8a1cce89da76d2ebb87a9d091e34f68c5..34038dfdc797d0e5ee618b575ad532fd64809276 100644
--- a/lite/tests/kernels/cast_compute_test.cc
+++ b/lite/tests/kernels/cast_compute_test.cc
@@ -135,7 +135,7 @@ TEST(Cast, precision) {
   float abs_error = 2e-5;
 #if defined(LITE_WITH_ARM)
   place = TARGET(kARM);
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
diff --git a/lite/tests/kernels/clip_compute_test.cc b/lite/tests/kernels/clip_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c6149bb753b2a83813d0a129d61d7444456c399
--- /dev/null
+++ b/lite/tests/kernels/clip_compute_test.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+
+namespace paddle {
+namespace lite {
+
+class ClipComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string x_ = "x";
+  std::string out_ = "out";
+  std::string min_tensor_ = "min_tensor";
+  std::string max_tensor_ = "max_tensor";
+  float min_{};
+  float max_{};
+  bool use_minmax_tensor_{};
+  DDim x_dims_;
+
+ public:
+  ClipComputeTester(const Place& place,
+                    const std::string& alias,
+                    int n,
+                    int c,
+                    int h,
+                    int w,
+                    float min,
+                    float max,
+                    bool use_minmax_tensor)
+      : TestCase(place, alias) {
+    x_dims_ = DDim(std::vector<int64_t>({n, c, h, w}));
+    min_ = min;
+    max_ = max;
+    use_minmax_tensor_ = use_minmax_tensor;
+  }
+
+  void RunBaseline(Scope* scope) override {
+    auto* x = scope->FindTensor(x_);
+    auto* out = scope->NewTensor(out_);
+    CHECK(out);
+    out->Resize(x->dims());
+    const auto* x_data = x->data<float>();
+    auto* out_data = out->mutable_data<float>();
+
+    for (int i = 0; i < x->numel(); i++) {
+      if (x_data[i] < min_)
+        out_data[i] = min_;
+      else if (x_data[i] > max_)
+        out_data[i] = max_;
+      else
+        out_data[i] = x_data[i];
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("clip");
+    op_desc->SetInput("X", {x_});
+    op_desc->SetOutput("Out", {out_});
+    if (use_minmax_tensor_) {
+      op_desc->SetInput("Min", {min_tensor_});
+      op_desc->SetInput("Max", {max_tensor_});
+      op_desc->SetAttr("min", 0.f);
+      op_desc->SetAttr("max", 0.f);
+    } else {
+      op_desc->SetAttr("min", min_);
+      op_desc->SetAttr("max", max_);
+    }
+  }
+
+  void PrepareData() override {
+    std::vector<float> x_data(x_dims_.production());
+    for (int i = 0; i < x_dims_.production(); i++) {
+      float sign = i % 3 == 0 ? -1.0f : 1.0f;
+      x_data[i] = sign * static_cast<float>(i % 128) * 0.013f + 0.001;
+    }
+    SetCommonTensor(x_, x_dims_, x_data.data());
+
+    if (use_minmax_tensor_) {
+      std::vector<float> min_data = {min_};
+      SetCommonTensor(
+          min_tensor_, DDim(std::vector<int64_t>({1})), min_data.data());
+
+      std::vector<float> max_data = {max_};
+      SetCommonTensor(
+          max_tensor_, DDim(std::vector<int64_t>({1})), max_data.data());
+    }
+  }
+};
+
+TEST(Clip, precision) {
+  LOG(INFO) << "test clip op";
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+
+  float min = -1;
+  float max = 1;
+  for (int n : {1, 3}) {
+    for (int c : {3, 5}) {
+      for (int h : {5, 6}) {
+        for (int w : {6, 7}) {
+          for (bool use_minmax_tensor : {true, false}) {
+            std::unique_ptr<arena::TestCase> tester(new ClipComputeTester(
+                place, "def", n, c, h, w, min, max, use_minmax_tensor));
+            arena::Arena arena(std::move(tester), place, 2e-5);
+            arena.TestPrecision();
+          }
+        }
+      }
+    }
+  }
+#endif
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/kernels/conv_compute_test.cc b/lite/tests/kernels/conv_compute_test.cc
index 4442fe47e3a6410aa921d163ef0257602cce2fbc..a4bcf6ea70e3fe719793aa4ebd8fb8cd09e35905 100644
--- a/lite/tests/kernels/conv_compute_test.cc
+++ b/lite/tests/kernels/conv_compute_test.cc
@@ -413,6 +413,9 @@ TEST(Conv2d, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 5e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
+  place = TARGET(kHuaweiAscendNPU);
+  abs_error = 5e-2;  // Using fp16 in NPU
 #else
   return;
 #endif
diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc
index 025f02ce31505cee684fb9a21c7b26d96e1c3026..c4ecc0cf01e3da7c43294ba1249b5b4f106caa95 100644
--- a/lite/tests/kernels/dropout_compute_test.cc
+++ b/lite/tests/kernels/dropout_compute_test.cc
@@ -94,7 +94,7 @@ TEST(Dropout, precision) {
 #if defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 1e-2;  // Using fp16 in NPU
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
return; diff --git a/lite/tests/kernels/elementwise_compute_test.cc b/lite/tests/kernels/elementwise_compute_test.cc index 505ab72dc125d5b527845f4695a444c215422f8b..d91c304ef7e76b9ff623ebfe1bb9ad5bb4ace2c9 100644 --- a/lite/tests/kernels/elementwise_compute_test.cc +++ b/lite/tests/kernels/elementwise_compute_test.cc @@ -228,7 +228,7 @@ TEST(Elementwise, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc index 2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488..04e74e49099f13a7e5920b306f8d2e26650a2574 100644 --- a/lite/tests/kernels/elementwise_grad_compute_test.cc +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -215,18 +215,6 @@ class ElementwiseAddGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -242,14 +230,6 @@ class ElementwiseAddGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); @@ -443,18 +423,6 @@ class ElementwiseSubGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -470,14 +438,6 @@ class ElementwiseSubGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc index 4d0ad1ab47a17c3e8d227b9e0482d7cbe21ab7e2..c023a12b0fb4e3118976d854114c554ca6bf6462 100644 --- a/lite/tests/kernels/gather_compute_test.cc +++ b/lite/tests/kernels/gather_compute_test.cc @@ -98,7 +98,7 @@ TEST(Gather, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/group_norm_compute_test.cc b/lite/tests/kernels/group_norm_compute_test.cc new file mode 100644 index 
0000000000000000000000000000000000000000..a1df003850731eb4d355d01f65100d2b9d200224
--- /dev/null
+++ b/lite/tests/kernels/group_norm_compute_test.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
+
+namespace paddle {
+namespace lite {
+
+class GroupNormComputeTest : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string x_ = "x";
+  std::string y_ = "y";
+  std::string saved_mean_ = "saved_mean";
+  std::string saved_variance_ = "saved_variance";
+  std::string scale_ = "scale";
+  std::string bias_ = "bias";
+
+  DDim dims_{{4, 5, 19, 19}};
+  float epsilon_ = 1e-5f;
+  int groups_ = 1;
+  int channels_ = dims_[1];
+
+ public:
+  GroupNormComputeTest(const Place& place,
+                       const std::string& alias,
+                       DDim dims,
+                       float epsilon,
+                       int groups,
+                       int channels)
+      : TestCase(place, alias),
+        dims_(dims),
+        epsilon_(epsilon),
+        groups_(groups),
+        channels_(channels) {}
+
+  void RunBaseline(Scope* scope) override {
+    auto x = scope->FindTensor(x_);
+    auto scale = scope->FindTensor(scale_);
+    auto bias = scope->FindTensor(bias_);
+    auto y = scope->NewTensor(y_);
+    auto saved_mean = scope->NewTensor(saved_mean_);
+    auto saved_variance = scope->NewTensor(saved_variance_);
+    CHECK(y);
+    CHECK(saved_mean);
+    CHECK(saved_variance);
+    DDim saved_dim({dims_[0] * groups_});
+    y->Resize(dims_);
+    saved_mean->Resize(saved_dim);
+    saved_variance->Resize(saved_dim);
+
+    auto x_data = x->data<float>();
+    auto scale_data = scale->data<float>();
+    auto bias_data = bias->data<float>();
+    auto y_data = y->mutable_data<float>();
+    auto saved_mean_data = saved_mean->mutable_data<float>();
+    auto saved_variance_data = saved_variance->mutable_data<float>();
+
+    int n = x->dims()[0];
+    int ch_per_group = channels_ / groups_;
+    CHECK_EQ(x->dims()[1], channels_);
+    int spatial_size = ch_per_group * x->dims()[2] * x->dims()[3];
+    // compute mean
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float sum = 0.f;
+      for (int j = 0; j < spatial_size; ++j) {
+        sum += x_ptr[j];
+      }
+      saved_mean_data[i] = sum / spatial_size;
+    }
+    // compute variance
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float sum = 0.f;
+      for (int j = 0; j < spatial_size; ++j) {
+        sum +=
+            (x_ptr[j] - saved_mean_data[i]) * (x_ptr[j] - saved_mean_data[i]);
+      }
+      saved_variance_data[i] = 1.f / sqrtf(sum / spatial_size + epsilon_);
+    }
+    int in_size = x->dims()[2] * x->dims()[3];
+    // compute out
+    for (int i = 0; i < n * groups_; ++i) {
+      const float* x_ptr = x_data + i * spatial_size;
+      float* y_ptr = y_data + i * spatial_size;
+      int c_num = i % groups_;
+      for (int c = 0; c < ch_per_group; c++) {
+        int chin = c_num * ch_per_group + c;
+        float scale_val = scale_data[chin];
+        float bias_val = bias_data[chin];
+        const float* x_ch_ptr = x_ptr + c * in_size;
+        float* y_ch_ptr = y_ptr + c * in_size;
+        for (int j = 0; j < in_size; j++) {
+          y_ch_ptr[j] = scale_val * (x_ch_ptr[j] - saved_mean_data[i]) *
+                            saved_variance_data[i] +
+                        bias_val;
+        }
+      }
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("group_norm");
+    op_desc->SetInput("X", {x_});
+    op_desc->SetInput("Bias", {bias_});
+    op_desc->SetInput("Scale", {scale_});
+    op_desc->SetOutput("Y", {y_});
+    op_desc->SetOutput("SavedMean", {saved_mean_});
+    op_desc->SetOutput("SavedVariance", {saved_variance_});
+    op_desc->SetAttr("epsilon", epsilon_);
+    op_desc->SetAttr("groups", groups_);
+    op_desc->SetAttr("channels", channels_);
+  }
+
+  void PrepareData() override {
+    std::vector<float> x(dims_.production());
+    fill_data_rand(x.data(), -1.f, 1.f, dims_.production());
+
+    DDim scale_bias_dims{{dims_[1]}};
+    std::vector<float> scale(scale_bias_dims.production());
+    fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_dims.production());
+    std::vector<float> bias(scale_bias_dims.production());
+    fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_dims.production());
+
+    SetCommonTensor(x_, dims_, x.data());
+    SetCommonTensor(scale_, scale_bias_dims, scale.data(), {}, true);
+    SetCommonTensor(bias_, scale_bias_dims, bias.data(), {}, true);
+  }
+};
+
+void TestGroupNorm(Place place,
+                   float abs_error = 6e-5,
+                   std::vector<std::string> ignored_outs = {}) {
+  for (auto& n : {1, 3, 16}) {
+    for (auto& c : {1}) {
+      for (auto& h : {1, 16, 33, 56}) {
+        for (auto& w : {1, 17, 55}) {
+          for (auto& groups : {1, 2, 4}) {
+            if (c % groups != 0) {
+              continue;
+            }
+            DDim dim_in({n, c, h, w});
+            float epsilon = 1e-5f;
+            std::unique_ptr<arena::TestCase> tester(new GroupNormComputeTest(
+                place, "def", dim_in, epsilon, groups, c));
+#ifdef LITE_WITH_ARM
+            if (place == TARGET(kARM)) {
+              auto& ctx = tester->context()->As<ARMContext>();
+              ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 4);
+            }
+#endif
+            arena::Arena arena(std::move(tester), place, abs_error);
+            if (!arena.TestPrecision(ignored_outs)) {
+              LOG(ERROR) << "run n: " << n << ", c: " << c << ", h: " << h
+                         << ", w: " << w;
+              return;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(GroupNorm, precision) {
+  Place place;
+  float abs_error = 6e-5;
+  std::vector<std::string> ignored_outs = {};
+#ifdef LITE_WITH_ARM
+  place = TARGET(kARM);
+#else
+  return;
+#endif
+  TestGroupNorm(place, abs_error, ignored_outs);
+}
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc
index 5ea01a6cca504db230d62a63ef3a62d4f73470fa..bd4480b6127a318286b3172f53fc8a5bceb8c328 100644
--- a/lite/tests/kernels/layer_norm_compute_test.cc
+++ b/lite/tests/kernels/layer_norm_compute_test.cc
@@ -147,7 +147,7 @@ TEST(LayerNorm, precision) {
   LOG(INFO) << "test layer_norm op";
   float abs_error = 2e-5;
   Place place;
-#if defined(LITE_WITH_XPU)
+#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #elif defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc
index 988077c6c319d5bcc8e50d6c8e5544331a86fe45..ae39abf1dbaf206fe0a68dd492a48a2452c8094e 100644
--- a/lite/tests/kernels/lookup_table_compute_test.cc
+++ b/lite/tests/kernels/lookup_table_compute_test.cc
@@ -116,7 +116,7 @@ TEST(LookupTable, precision) {
   abs_error = 1e-2;
 #elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
-#elif defined(LITE_WITH_XPU)
+#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
   place = TARGET(kXPU);
 #else
   return;
@@ -132,7 +132,8 @@
TEST(LookupTable, precision) { std::vector>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) { for (auto w_dims : std::vector>{{4, 2}, {6, 8}, {12, 15}}) { -#if defined(LITE_WITH_XPU) && defined(LITE_WITH_NPU) +#if (defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)) || \ + defined(LITE_WITH_NPU) for (auto padding_idx : std::vector{-1}) { // Only -1 is supported by XPU or NPU #else diff --git a/lite/tests/kernels/matmul_compute_test.cc b/lite/tests/kernels/matmul_compute_test.cc index 59b0fde8fd18b8a2170b6fdbd42444f09843f077..9799c15622b07a8d126654c79738d29b176c2cf4 100644 --- a/lite/tests/kernels/matmul_compute_test.cc +++ b/lite/tests/kernels/matmul_compute_test.cc @@ -457,7 +457,7 @@ TEST(Matmul2x2, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -489,7 +489,7 @@ TEST(Matmul2x2_y_transpose, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d070292332b65ed577ec6cefdb220ee691eb99e9..d89b3569358034d72ac8019f2348b49764ca6b0c 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -127,7 +127,7 @@ TEST(Mul, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // use fp16 in npu -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc index a1190197bffdf505fec77c6b22b7871316a2d125..dd16730ef551ddc11825936d99733f33015fd2c0 100644 --- a/lite/tests/kernels/multiclass_nms_compute_test.cc +++ b/lite/tests/kernels/multiclass_nms_compute_test.cc @@ -478,7 +478,7 @@ TEST(multiclass_nms, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc index 04894188b0bf1557000479ae18b0369997909f89..fc4d004e552e76792470f46a54afd6aa13bbc330 100644 --- a/lite/tests/kernels/pool_compute_test.cc +++ b/lite/tests/kernels/pool_compute_test.cc @@ -381,7 +381,7 @@ TEST(Pool, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/prior_box_compute_test.cc b/lite/tests/kernels/prior_box_compute_test.cc index 73fd612c3a03c0a15ddaf3ce6c08ff0ed1a5a95b..ec0eda8cbb2b7f8d6ab01efa467ed857d817905a 100644 --- a/lite/tests/kernels/prior_box_compute_test.cc +++ b/lite/tests/kernels/prior_box_compute_test.cc @@ -21,7 +21,7 @@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 3a866b6cf22cf67c3f5a60e5a4aa8603cee6a1a3..f3fcc0bad5418624c86897bafc52dbf3a7ec0d8e 100644 --- 
a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -206,7 +206,7 @@ TEST(Reshape, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/roi_align_compute_test.cc b/lite/tests/kernels/roi_align_compute_test.cc index 8eb84dd0337d0635dc360e2e04aa1ad047e912c0..2bbfdcd81da951bd769ab03094a0df48f3a6e13b 100644 --- a/lite/tests/kernels/roi_align_compute_test.cc +++ b/lite/tests/kernels/roi_align_compute_test.cc @@ -106,13 +106,11 @@ class RoiAlignComputeTester : public arena::TestCase { } LOG(INFO) << "Read rois data. " << datas[0] << " " << datas.back(); reader.close(); - SetCommonTensor(rois_, dims, datas.data()); - auto rois_tensor = baseline_scope()->FindMutableTensor(rois_); std::vector lod0({0, 152, 304}); LoD lod; lod.push_back(lod0); - rois_tensor->set_lod(lod); + SetCommonTensor(rois_, dims, datas.data(), lod); } }; diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index efd0497002ee402426a7198bf47ec60c7f41d2fd..9d1f4403dc1a82e58d8c764933ba01c0e0b5c082 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -165,7 +165,7 @@ TEST(Scale, precision) { abs_error = 4e-3; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); abs_error = 3e-4; // Some operations use fp16 in XPU #elif defined(LITE_WITH_X86) diff --git a/lite/tests/kernels/sequence_conv_compute_test.cc b/lite/tests/kernels/sequence_conv_compute_test.cc index 84887b2573516d0c82cbb8c9b4cf9336f30ee41d..68afaad04f8e84995e811f81f99a2d4109c845a5 100644 --- a/lite/tests/kernels/sequence_conv_compute_test.cc +++ b/lite/tests/kernels/sequence_conv_compute_test.cc @@ -85,21 +85,31 @@ class SequenceConvComputeTester : public arena::TestCase { auto output_dims = output->dims(); auto output_data = output->mutable_data(); std::vector> res; - if (contextStart_ == -2) { + + if (contextStart_ == -2 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{-0.08867277f, -0.17257819f, -0.2564836f}, {0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}}; - } else if (contextStart_ == -1) { + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}}; - } else if (contextStart_ == 0) { + } else if (contextStart_ == 0 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}, {0.02574372f, 0.03337148f, 0.04099924f}}; + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 2, 4})) { + res = {{0.194508, 0.05720823, -0.08009153}, + {0.7093821, 0.57208234, 0.43478262}, + {0.19450802, 0.17925248, 0.16399695}, + {0.2517162, 0.23646072, 0.22120519}}; } else { fprintf(stderr, "not supported contextStart_\n"); exit(-1); @@ -136,12 +146,25 @@ void TestNormalCase(Place place, float abs_error = 2e-5) { } } +void TestBatchCase(Place place, float abs_error = 2e-5) { + std::vector> 
lod{{0, 2, 4}}; + std::vector dims{4, 5}; + std::vector candidate_pad_idx{-1}; + for (int pad_idx : candidate_pad_idx) { + std::unique_ptr tester(new SequenceConvComputeTester( + place, "def", lod, DDim(dims), pad_idx, 1, 3, 3)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + TEST(sequence_conv, precision) { #ifdef LITE_WITH_ARM float abs_error = 2e-5; Place place(TARGET(kARM)); TestNormalCase(place, abs_error); + TestBatchCase(place, abs_error); #endif } diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index fc96b39f010eab5eedd431cb81e881b7aadb11a2..b566bfa3e86cf6067f9914b5fc3932458a6ee186 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -202,20 +202,15 @@ class SliceComputeTester : public arena::TestCase { DDim({static_cast(ends_.size())}), ends_.data()); } else if (use_tensor_list_) { - Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = scope_.NewTensor("starts_tensor_list_" + - paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = starts_[i]; + SetCommonTensor("starts_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &starts_[i]); } for (int i = 0; i < ends_.size(); ++i) { - auto* tensor = - scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = ends_[i]; + SetCommonTensor("ends_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &ends_[i]); } } } @@ -273,7 +268,7 @@ TEST(Slice, precision) { test_slice(place); test_slice_tensor(place); test_slice_tensor_list(place); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) Place place(TARGET(kXPU)); test_slice(place); #endif diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc index a91f6534ffa1f8022e2005cc83255d306adf77c1..87a94aba184a055081446b4df830b72146834ed2 100644 --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -111,8 +111,12 @@ TEST(Softmax, precision) { for (auto x_dims : std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { - for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= x_dims.size()) continue; + int ndims = x_dims.size(); + for (int axis = -1; axis < ndims; axis++) { +#if defined(LITE_WITH_XPU) + if (axis != -1 && axis != ndims - 1) + continue; // -1 and dims.size() - 1 are only supported by XPU +#endif std::unique_ptr tester( new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); diff --git a/lite/tests/kernels/stack_compute_test.cc b/lite/tests/kernels/stack_compute_test.cc index 10b289e41972eb6a9f332f0376393fdfaae94abe..72529cac5165badd50c086a75e882417725adb96 100644 --- a/lite/tests/kernels/stack_compute_test.cc +++ b/lite/tests/kernels/stack_compute_test.cc @@ -106,7 +106,7 @@ TEST(Stack, precision) { Place place; #ifdef LITE_WITH_ARM place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc index 0ec010e47fe22f0bd60f0c275696f726b6f01a68..933e9f8ec5fc7b1d9b510c71f57fda309a5477dc 100644 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -164,7 +164,7 @@ 
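The TestBatchCase added above feeds sequence_conv a level-0 LoD of {{0, 2, 4}} with a 4 x 5 input, i.e. the four rows are split into two sequences of two rows each. A small hedged sketch of how such LoD offsets partition the rows (the helper name is illustrative only):

#include <cstddef>
#include <cstdio>
#include <vector>

// Split the rows of a (num_rows x width) buffer into sequences using
// level-0 LoD offsets, e.g. {0, 2, 4} -> rows [0, 2) and rows [2, 4).
void print_sequences(const std::vector<float>& data, int width,
                     const std::vector<std::size_t>& lod0) {
  for (std::size_t s = 0; s + 1 < lod0.size(); ++s) {
    std::printf("sequence %zu: rows %zu..%zu\n", s, lod0[s], lod0[s + 1] - 1);
    for (std::size_t r = lod0[s]; r < lod0[s + 1]; ++r) {
      for (int c = 0; c < width; ++c) {
        std::printf("%6.2f ", data[r * width + c]);
      }
      std::printf("\n");
    }
  }
}

int main() {
  std::vector<float> x(4 * 5);
  for (std::size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);
  print_sequences(x, 5, {0, 2, 4});  // two sequences, two rows each
}

This is why the new expected-result branch in the test is keyed on both contextStart_ and the exact LoD: the batched case convolves each two-row sequence independently.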
TEST(Transpose, precision) { LOG(INFO) << "test Transpose op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/yolo_box_compute_test.cc b/lite/tests/kernels/yolo_box_compute_test.cc index c41c89608fd7496c5b01b1a813581f7f461ff0ee..b88f25e1e0ddb85683297c19a841a5d47b2bbccf 100644 --- a/lite/tests/kernels/yolo_box_compute_test.cc +++ b/lite/tests/kernels/yolo_box_compute_test.cc @@ -247,7 +247,7 @@ TEST(YoloBox, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index 8265f9db2f85e54dd91314ac5dc7932e7f7e842a..9ad98ce6f4566898b3821e6bf540b331a84b97bb 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -236,19 +236,19 @@ void test_conv_fp32(const std::vector& input_dims, double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; - LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.LapTimes().Avg() - << ", min time: " << t0.LapTimes().Min() - << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + VLOG(4) << "conv fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; double max_diff = 0; tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff - << ", max ratio: " << max_ratio; + VLOG(4) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; if (std::abs(max_ratio) > 1e-3f) { if (max_diff > 5e-4f) { LOG(WARNING) << "basic result"; @@ -274,15 +274,15 @@ void test_conv_fp32(const std::vector& input_dims, } } } - LOG(INFO) << "test fp32 conv: input: " << dim_in - << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] - << ", " << pads[3] << ", stride: " << strides[0] << ", " - << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] - << ", group: " << group - << ", bias: " << (flag_bias ? "true" : "false") - << ", act: " << flag_act << ", threads: " << th - << ", power_mode: " << cls << " successed!!\n"; + VLOG(4) << "test fp32 conv: input: " << dim_in + << ", output: " << dim_out << ", weight dim: " << weight_dim + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? 
"true" : "false") + << ", act: " << flag_act << ", threads: " << th + << ", power_mode: " << cls << " successed!!\n"; } } } diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index 8dac81fe9f08f3e85fab844ce2df0965fbb52289..ecd5c3966df3115a366fd722b3978258c88c0bf5 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -34,7 +34,7 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); DEFINE_int32(batch, 1, "batch size"); @@ -614,6 +614,9 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { dims.push_back(DDim({batch, cin, h, h})); } } + if (cin == 1 && cout == 1) { + continue; + } test_conv_int8(dims, weights_dim, 1, diff --git a/lite/tests/math/deformable_conv_compute_test.cc b/lite/tests/math/deformable_conv_compute_test.cc index e97203123d1db0752189a9965c922b048cd6bd38..76cb970ffe428ed393cdbdae0d281e6a511655ac 100644 --- a/lite/tests/math/deformable_conv_compute_test.cc +++ b/lite/tests/math/deformable_conv_compute_test.cc @@ -34,7 +34,7 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); DEFINE_int32(batch, 1, "batch size"); @@ -342,7 +342,7 @@ TEST(TestDeformableConvRand, test_deformable_conv_rand) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8}) { for (auto& cout : {1, 5, 16}) { - for (auto& g : {1, 2}) { + for (auto& g : {1}) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index adae19d013e50fbd484257a99f55229c75b94263..57899c8d1e2e0c073f410e90d18119327f21f066 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -120,6 +120,10 @@ bool test_gemm_int8(bool tra, auto dc_fp32 = tc_fp32.mutable_data(); auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); auto dbias = tbias.mutable_data(); if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index 99db53511446ecd4772fa2fd1b202337581506ef..3819c0dcd7f87c69a5805aae643a6a3a4a037f03 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -108,6 +108,10 @@ bool test_gemv_int8(bool tra, auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); auto dbias = tbias.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc index 3e5577e03075502bab30aa03a50241b817fa8742..ecdf77fd37fff1da2914eeca5e29ef931de09c53 100644 --- a/lite/tests/math/sgemm_c4_compute_test.cc +++ 
b/lite/tests/math/sgemm_c4_compute_test.cc @@ -92,6 +92,7 @@ bool test_sgemm_c4( auto db_c4 = tb_c4.mutable_data(); auto dc_basic = tc_basic.mutable_data(); auto dbias = tbias.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); // trans A, B to c4 basic_trans_mat_to_c4(da, da_c4, k, m, k, true); @@ -179,6 +180,141 @@ bool test_sgemm_c4( #endif return true; } +bool test_sgemm_c8( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 7) / 8 * 8; + int k_round = (k + 7) / 8 * 8; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c8 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c8; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c8.Resize({size_b_c8}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kInt16)); + tb.set_precision(PRECISION(kInt16)); + ta_c4.set_precision(PRECISION(kInt16)); + tb_c8.set_precision(PRECISION(kInt16)); + tc.set_precision(PRECISION(kInt32)); + tc_basic.set_precision(PRECISION(kInt32)); + tbias.set_precision(PRECISION(kInt32)); + + fill_tensor_rand(ta); + fill_tensor_rand(tb); + fill_tensor_rand(tbias); + fill_tensor_rand(tc); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c8 = tb_c8.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c8(da, da_c4, k, m, k, true); + basic_trans_mat_to_c8(db, db_c8, n, k, n, false); + + LOG(INFO) << "sgemm_c8 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c8(false, + false, + m, + n, + k, + 1, + da, + k, + db, + n, + 0, + dc_basic, + n, + dbias, + false, + false); + } + Timer t0; + LOG(INFO) << "basic test end"; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c8_int16_small( + m, n, k, da_c4, db_c8, dc, &ctx); + } + LOG(INFO) << "basic test end"; + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c8_int16_small( + m, n, k, da_c4, db_c8, dc, &ctx); + t0.Stop(); + } + LOG(INFO) << "basic test end"; + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kInt32)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c8: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c8: "; + print_tensor(tb_c8); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { if (FLAGS_basic_test) { @@ -186,11 +322,11 @@ TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { paddle::lite::DeviceInfo::Init(); #endif LOG(INFO) << "run basic sgemm_c4 test"; - for (auto& m : {1, 3, 8, 32, 397}) { - for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { - for (auto& k : {1, 3, 8, 59, 234}) { - for (auto& has_bias : {false, true}) { - for (auto& has_relu : {false, true}) { + for (auto& m : {1, 3, 8, 32, 397, 32, 64, 77}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789, 1}) { + for (auto& k : {1, 3, 8, 59, 234, 19}) { + for (auto& has_bias : {false}) { + for (auto& has_relu : {false}) { for (auto& th : {1, 2, 4}) { auto flag = test_sgemm_c4( m, n, k, has_bias, has_relu, FLAGS_power_mode, th); @@ -213,8 +349,41 @@ TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { } } } +TEST(TestSgemmC8, test_func_sgemm_c8_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397, 32, 64, 77}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789, 1}) { + for (auto& k : {1, 3, 8, 59, 234, 19}) { + for (auto& has_bias : {false}) { + for (auto& has_relu : {false}) { + for (auto& th : {1}) { + auto flag = test_sgemm_c8( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} -TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +TEST(TestSgemmCnCustom, test_func_sgemm_cn_prepacked_custom) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -230,6 +399,18 @@ TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu << " failed!!"; } + flag = test_sgemm_c8(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu << " passed!!"; diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc index 91a1fe1770dfa3eeb3f3b94fcd2361f1c1634b1e..661c4f02aa7eafe807f77767dfd4db01a338993e 100644 --- a/lite/tests/math/sgemv_compute_test.cc +++ b/lite/tests/math/sgemv_compute_test.cc @@ -84,6 +84,7 @@ bool test_sgemv(bool tra, auto db = tb.mutable_data(); auto dc = tc.mutable_data(); auto dc_basic = tc_basic.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); auto dbias = tbias.mutable_data(); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index a1e793f91d8cd75a2daa7eb46134b841ecf1eac7..0a89d7ca3eaf52ccdfd6c1ce1727669b8c7284e1 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -62,6 +62,72 @@ static void basic_trans_mat_to_c4(const type* input, } delete[] zero_buf; } +template +static void basic_trans_mat_to_c8(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 7) / 8 * 8; + int k_round = (K + 7) / 8 * 8; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 8; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 8 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + const type* in4 = in3 + ldin; + const type* in5 = in4 + ldin; + const type* in6 = in5 + ldin; + const type* in7 = in6 + ldin; + if (8 * (i + 1) - M > 0) { + switch (8 * (i + 1) - M) { + case 7: + in1 = zero_buf; + case 6: + in2 = zero_buf; + case 5: + in3 = zero_buf; + case 4: + in4 = zero_buf; + case 3: + in5 = zero_buf; + case 2: + in6 = zero_buf; + case 1: + in7 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + *output++ = *in4++; + *output++ = *in5++; + *output++ = *in6++; + *output++ = *in7++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} template static void basic_gemm_c4(bool trans_a, @@ -118,6 +184,60 @@ static void basic_gemm_c4(bool trans_a, free(tmp_c); } +template +static void basic_gemm_c8(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + 
const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! trans c to c4 + basic_trans_mat_to_c8(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} template static void basic_gemm(bool trans_a, bool trans_b, diff --git a/lite/tests/utils/tensor_utils.h b/lite/tests/utils/tensor_utils.h index 5a48b9da6c28b8da784acdaac4d89900d44728f9..8882bb2c08f7e5c930ad7284b31ccd4fd30b8c65 100644 --- a/lite/tests/utils/tensor_utils.h +++ b/lite/tests/utils/tensor_utils.h @@ -50,6 +50,10 @@ void fill_tensor_const(Tensor& tensor, float value) { // NOLINT fill_tensor_host_const_impl( tensor.mutable_data(), static_cast(value), size); break; + case PRECISION(kInt16): + fill_tensor_host_const_impl( + tensor.mutable_data(), static_cast(value), size); + break; case PRECISION(kInt32): fill_tensor_host_const_impl( tensor.mutable_data(), static_cast(value), size); @@ -78,6 +82,12 @@ void fill_tensor_host_rand_impl(signed char* dio, int64_t size) { } } template <> +void fill_tensor_host_rand_impl(int16_t* dio, int64_t size) { + for (int64_t i = 0; i < size; ++i) { + dio[i] = (rand() % 256 - 128) * 2; // NOLINT + } +} +template <> void fill_tensor_host_rand_impl(unsigned char* dio, int64_t size) { for (int64_t i = 0; i < size; ++i) { @@ -95,6 +105,9 @@ void fill_tensor_rand(Tensor& tensor) { // NOLINT case PRECISION(kInt8): fill_tensor_host_rand_impl(tensor.mutable_data(), size); break; + case PRECISION(kInt16): + fill_tensor_host_rand_impl(tensor.mutable_data(), size); + break; case PRECISION(kInt32): fill_tensor_host_rand_impl(tensor.mutable_data(), size); break; diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 9365120772d96d31ff0af98c2cab4dea609be5ab..f3f9b9a94236b0d4f25448deb6a702b82c38740f 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -22,6 +22,7 @@ OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF WITH_LOG=ON +WITH_EXCEPTION=OFF WITH_PROFILE=OFF BUILD_NPU=OFF NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ @@ -32,6 +33,9 @@ BUILD_APU=OFF APU_DDK_ROOT="$(pwd)/apu_sdk_lib/" BUILD_RKNPU=OFF RKNPU_DDK_ROOT="$(pwd)/rknpu/" +WITH_HUAWEI_ASCEND_NPU=OFF # Huawei Ascend Builder/Runtime Libs on X86 host +# default installation path, ensure acllib/atc/opp directories are all in this root dir +HUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" PYTHON_EXECUTABLE_OPTION="" readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -39,8 +43,8 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$PWD # if operating in mac env, we should expand the maximum file num -os_nmae=`uname 
-s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -126,6 +130,7 @@ function make_tiny_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -181,6 +186,7 @@ function make_opencl { -DWITH_TESTING=OFF \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_CV=$BUILD_CV \ -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 @@ -219,6 +225,7 @@ function make_full_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -343,6 +350,8 @@ function make_cuda { -DLITE_WITH_STATIC_CUDA=OFF \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT @@ -358,6 +367,11 @@ function make_x86 { root_dir=$(pwd) build_directory=$BUILD_DIR/build.lite.x86 + if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then + export CXX=/usr/bin/g++ # Ascend need g++ in centos + build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu + fi + if [ -d $build_directory ] then rm -rf $build_directory @@ -379,10 +393,13 @@ function make_x86 { -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=$WITH_HUAWEI_ASCEND_NPU \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT=$HUAWEI_ASCEND_NPU_DDK_ROOT \ -DCMAKE_BUILD_TYPE=Release \ -DPY_VERSION=$PY_VERSION \ $PYTHON_EXECUTABLE_OPTION @@ -409,6 +426,7 @@ function print_usage { echo echo -e "optional argument:" echo -e "--with_log: (OFF|ON); controls whether to print log information, default is ON" + echo -e "--with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_train: (OFF|ON); controls whether to publish training operators and kernels, build_train is only for full_publish library now" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" @@ -491,6 +509,17 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARM_OS=="android" && $ARM_ABI == "armv7" && $ARM_LANG != "clang" ]]; then + set +x + echo + echo -e "error: only clang provide C++ exception handling support for 32-bit ARM." 
+ echo + exit 1 + fi + shift + ;; --with_profile=*) WITH_PROFILE="${i#*=}" shift @@ -539,6 +568,14 @@ function main { RKNPU_DDK_ROOT="${i#*=}" shift ;; + --with_huawei_ascend_npu=*) + WITH_HUAWEI_ASCEND_NPU="${i#*=}" + shift + ;; + --huawei_ascend_npu_ddk_root=*) + HUAWEI_ASCEND_NPU_DDK_ROOT="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_android.sh b/lite/tools/build_android.sh index aba5fb706cb62e5bc9b50127f16d07e0db55d595..ecf34f0dfc4ddd141af9ea07dd6c4f15d1c0c16b 100755 --- a/lite/tools/build_android.sh +++ b/lite/tools/build_android.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x ##################################################################################################### # 1. global variables, you can change them according to your requirements ##################################################################################################### @@ -17,6 +17,8 @@ WITH_JAVA=ON WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. OPTMODEL_DIR="" WITH_STRIP=OFF @@ -145,6 +147,7 @@ function make_tiny_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -194,6 +197,7 @@ function make_full_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -237,6 +241,7 @@ function print_usage { echo -e "| --with_java: (OFF|ON); controls whether to publish java api lib, default is ON |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -269,6 +274,7 @@ function main { if [ -z "$1" ]; then # compiling result contains light_api lib only, recommanded. make_tiny_publish_so $ARCH $TOOLCHAIN $ANDROID_STL + exit 0 fi # Parse command line. @@ -319,6 +325,18 @@ function main { WITH_LOG="${i#*=}" shift ;; + # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARCH == "armv7" && $TOOLCHAIN != "clang" ]]; then + set +x + echo + echo -e "Error: only clang provide C++ exception handling support for 32-bit ARM." + echo + exit 1 + fi + shift + ;; # compiling lib which can operate on opencl and cpu. --with_opencl=*) WITH_OPENCL="${i#*=}" @@ -358,6 +376,7 @@ function main { done # compiling result contains light_api lib only, recommanded. 
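Regarding the --with_exception option introduced above: when the library is built with it, fatal checks are reported by throwing std::exception instead of aborting the process (see the logging.h change later in this patch), so a host application can guard calls with try/catch. The sketch below is a hedged stand-in for that behavior, not Paddle-Lite's actual check macro.

#include <cstdlib>
#include <exception>
#include <iostream>

// Illustrative stand-in for a fatal check inside the library: with
// LITE_WITH_EXCEPTION defined it throws, otherwise it aborts.
void fatal_check(bool cond, const char* msg) {
  if (cond) return;
  std::cerr << msg << std::endl;
#ifdef LITE_WITH_EXCEPTION
  throw std::exception();
#else
  std::abort();
#endif
}

int main() {
  try {
    fatal_check(false, "shape mismatch");  // simulated library error
  } catch (const std::exception&) {
    std::cerr << "recovered from a fatal error instead of crashing\n";
  }
}

Note the constraint enforced by the build scripts: on 32-bit ARM Android, C++ exception handling is only supported with the clang toolchain, hence the error check around --with_exception for armv7 + gcc.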
make_tiny_publish_so + exit 0 } main $@ diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh index 964da15b0b6fcf888812271b0a2c944d9efa63b8..055f6a35c3ab145e9dfe4bc5d46172a2119ffb25 100755 --- a/lite/tools/build_bm.sh +++ b/lite/tools/build_bm.sh @@ -43,7 +43,7 @@ function prepare_thirdparty { # clone bmlibs if [ ! -d ${workspace}/third-party/bmlibs ]; then git clone https://github.com/AnBaolei1984/bmlibs.git ${workspace}/third-party/bmlibs - fi + fi } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. @@ -70,6 +70,13 @@ function build_bm { mkdir -p $build_dir cd $build_dir + if [ $TARGET_NAME == "BM1684" ]; then + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc5_libs" + else + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc3_libs" + fi + echo $BM_SDK_ROOT + prepare_workspace cmake .. \ ${CMAKE_COMMON_OPTIONS} \ @@ -95,17 +102,7 @@ function main { case $i in --target_name=*) TARGET_NAME="${i#*=}" - shift - ;; - #--bm_sdk_root=*) - # BM_SDK_ROOT="${i#*=}" - # shift - # ;; - bm) build_bm - shift - ;; - *) # unknown option print_usage exit 1 diff --git a/lite/tools/build_ios.sh b/lite/tools/build_ios.sh index 2c7eeb466f3d82cf491b6a631d79918fa4fd4cd2..4eea073a058ba9e1e821e9f0746687baa0c38d5f 100755 --- a/lite/tools/build_ios.sh +++ b/lite/tools/build_ios.sh @@ -12,6 +12,8 @@ WITH_EXTRA=OFF WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # absolute path of Paddle-Lite. workspace=$PWD/$(dirname $0)/../../ # options of striping lib according to input model. @@ -69,6 +71,7 @@ function make_ios { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DLITE_WITH_X86=OFF \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_ARCH_ABI=$arch \ @@ -96,6 +99,7 @@ function print_usage { echo -e "| --arch: (armv8|armv7), default is armv8 |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -140,6 +144,10 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; help) print_usage exit 0 @@ -152,6 +160,7 @@ function main { esac done make_ios $ARCH + exit 0 } main $@ diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 5ed491cb7da7b33357b7e66ab8267e60815b5348..f6de128feb6073fe206d03b68c5d8bc04dc9f16c 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -17,6 +17,8 @@ PY_VERSION="" WITH_CV=OFF # controls whether to print log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. 
WITH_STRIP=OFF OPTMODEL_DIR="" @@ -60,6 +62,7 @@ function init_cmake_mutable_options { -DPY_VERSION=$PY_VERSION \ -DLITE_WITH_CV=$WITH_CV \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_OPENCL=$WITH_OPENCL \ @@ -210,6 +213,7 @@ function print_usage { echo -e "| --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| |" echo -e "| arguments of striping lib according to input model: |" echo -e "| ./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir |" @@ -280,6 +284,11 @@ function main { shift ;; # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; + # ON or OFF, default OFF --with_strip=*) BUILD_TAILOR="${i#*=}" shift diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh index 01d71aaf213abb99633112664af580b897ce7454..e0fb2ab11b110cf5a29151ea7c8e544a4074c8c5 100755 --- a/lite/tools/build_mlu.sh +++ b/lite/tools/build_mlu.sh @@ -4,7 +4,7 @@ set -ex # global variables with default value NEUWARE_HOME="${NEUWARE_HOME}" TARGET_NAME="all" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +BUILD_EXTRA=ON # ON(with sequence ops)/OFF WITH_TESTING=ON # ON/OFF function print_usage { @@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$(pwd) function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + if [ ! -d $workspace/third-party ]; then rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive fi + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xvf third-party-05b862.tar.gz } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. diff --git a/lite/tools/check_api_approvals.sh b/lite/tools/check_api_approvals.sh old mode 100644 new mode 100755 index 6100558d68abb2b4c82c1f367078e519972546ce..b2a4659c964121b0a95961195340c296710db2de --- a/lite/tools/check_api_approvals.sh +++ b/lite/tools/check_api_approvals.sh @@ -5,13 +5,14 @@ if [ -z ${BRANCH} ]; then fi LITE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../.." && pwd )" - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle-Lite/pulls/${GIT_PR_ID}/reviews?per_page=10000` -git_files=`git diff --numstat upstream/$BRANCH| wc -l` -git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 echo_list=() +# approval list +Superjomn=328693 +DannyIsFunny=45189361 + function add_failed(){ failed_num=`expr $failed_num + 1` echo_list="${echo_list[@]}$1" @@ -24,20 +25,105 @@ function check_approval(){ add_failed "${failed_num}. 
${echo_line}" fi } +#################################################################################################### +# Check 1: You must have Superjomn's (Yunchunwei) approval for changing +# 20+ files or adding more than 1000+ lines of content +#################################################################################################### +function CheckModifiedFileNums() { + git_files=`git diff --numstat upstream/$BRANCH| wc -l` + git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` + + if [[ $git_files -gt 19 || $git_count -gt 999 ]];then + echo_line="You must have Superjomn's (Yunchunwei) approval for changing 20+ files or adding more than 1000+ lines of content.\n" + check_approval 1 $Superjomn + fi + if [ -n "${echo_list}" ];then + echo "****************" + echo -e "${echo_list[@]}" + echo "There are ${failed_num} approved errors." + echo "****************" + fi + + if [ -n "${echo_list}" ]; then + exit 1 + fi +} +#################################################################################################### +# Check 2: You must have Superjomn's (Yunchunwei) approval for increasing +# size of dynamic lib for 10+ kb +#################################################################################################### +function CheckLibSizeDiff() { + # step1: record lib size of current branch + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + current_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` -if [[ $git_files -gt 19 || $git_count -gt 999 ]];then - echo_line="You must have Superjomn (Yunchunwei) approval for change 20+ files or add than 1000+ lines of content.\n" - check_approval 1 328693 -fi + # step2: record lib size of current develop branch + git checkout develop + git clean -f . && git checkout . + git fetch upstream && git merge upstream/develop -if [ -n "${echo_list}" ];then - echo "****************" - echo -e "${echo_list[@]}" - echo "There are ${failed_num} approved errors." - echo "****************" -fi + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + develop_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + + # step3: if diff_size > 10485, special approval is needed + diff_size=$[$current_size - $develop_size] + if [ $diff_size -gt 10485 ]; then + echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" + echo "****************" + echo -e "${echo_line[@]}" + echo "There is an approved errors." + echo "****************" + exit 1 + fi +# Todo: Code below should be applied later. +# if [ $diff_size -gt 10485 ]; then +# echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" +# check_approval 1 $Superjomn +# fi +# +# if [ -n "${echo_list}" ];then +# echo "****************" +# echo -e "${echo_list[@]}" +# echo "There are ${failed_num} approved errors." 
+# echo "****************" +# fi +# +# if [ -n "${echo_list}" ]; then +# exit 1 +# fi +} + +#################################################################################################### +# Main functions +#################################################################################################### +function main { + if [ -z "$1" ]; then + # at least on argument is needed + echo "Error: at least on argument is needed!" + exit 1 + fi + + # Parse command line. + for i in "$@"; do + case $i in + check_modified_file_nums) + # modified files num can not exceed 20 + + CheckModifiedFileNums + exit 0 + ;; + check_lib_size_diff) + # size diff can not exceed 10K + + CheckLibSizeDiff + exit 0 + ;; + *) + # unknown option + echo "Error: unsupported input argument!" + exit 1 + ;; + esac + done +} -if [ -n "${echo_list}" ]; then - exit 1 -fi +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 29ed9100f932b3215e45fc2352b5f0d73b7349b1..9cec7cdc5d566d1db5a8de4c723a9e0b11408d4d 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -21,8 +21,8 @@ USE_ADB_EMULATOR=ON LITE_WITH_COVERAGE=OFF # if operating in mac env, we should expand the maximum file num -os_nmae=`uname -s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -279,7 +279,7 @@ function test_server { } function assert_api_spec_approvals() { - /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh + /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh check_modified_file_nums if [ "$?" != 0 ];then exit 1 fi @@ -353,7 +353,7 @@ function cmake_xpu { -DWITH_MKL=ON \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=ON \ - -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" + -DXPU_SDK_ROOT="/opt/output" } function build_xpu { @@ -399,6 +399,64 @@ function build_test_xpu { test_xpu } +function cmake_huawei_ascend_npu { + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${common_flags} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=ON \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" \ + -DCMAKE_BUILD_TYPE=Release +} + +function build_huawei_ascend_npu { + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE +} + +# It will eagerly test all lite related unittests. +function test_huawei_ascend_npu { + # Due to the missing of ascend kernels, we skip the following tests temporarily. + # TODO(xxx) clear the skip list latter + local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet" + "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" + "test_inceptionv4_lite_x86" "test_light_api" + "test_apis" "test_model_bin" + ) + local to_skip=0 + for _test in $(cat $TESTS_FILE); do + to_skip=0 + for skip_name in ${skip_list[@]}; do + if [ $skip_name = $_test ]; then + echo "to skip " $skip_name + to_skip=1 + fi + done + + if [ $to_skip -eq 0 ]; then + ctest -R $_test -V + fi + done +} + +# Build the code and run lite server tests. This is executed in the CI system. 
+function build_test_huawei_ascend_npu { + cur_dir=$(pwd) + + build_dir=$cur_dir/build.lite.huawei_ascend_npu_test + mkdir -p $build_dir + cd $build_dir + + cmake_huawei_ascend_npu + build_huawei_ascend_npu + + test_huawei_ascend_npu +} + # test_arm_android function test_arm_android { local test_name=$1 @@ -415,7 +473,7 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl") + skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl" "test_transformer_with_mask_fp32_arm") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done @@ -564,8 +622,18 @@ function test_arm_model { function test_model_optimize_tool_compile { cd $workspace cd build + # Compile opt tool cmake .. -DWITH_LITE=ON -DLITE_ON_MODEL_OPTIMIZE_TOOL=ON -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON make opt -j$NUM_CORES_FOR_COMPILE + # Check whether opt can transform quantized mobilenetv1 successfully. + cd lite/api && chmod +x ./opt + wget --no-check-certificate https://paddlelite-data.bj.bcebos.com/doc_models/MobileNetV1_quant.tar.gz + tar zxf MobileNetV1_quant.tar.gz + ./opt --model_dir=./MobileNetV1_quant --valid_targets=arm --optimize_out=quant_mobilenetv1 + if [ ! -f quant_mobilenetv1.nb ]; then + echo -e "Error! Resulted opt can not tramsform MobileNetV1_quant successfully!" 
+ exit 1 + fi } function _test_paddle_code_generator { @@ -1147,6 +1215,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + test_huawei_ascend_npu) + test_huawei_ascend_npu + shift + ;; build_test_cuda_server) build_test_cuda_server shift @@ -1164,6 +1236,10 @@ function main { build_test_xpu shift ;; + build_test_huawei_ascend_npu) + build_test_huawei_ascend_npu + shift + ;; build_test_train) build_test_train shift @@ -1189,6 +1265,7 @@ function main { build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu build_test_arm_subtask_model test_resnet50 resnet50 build_test_arm_subtask_model test_inceptionv4 inception_v4_simple + build_test_arm_subtask_model test_transformer_with_mask_fp32_arm transformer_with_mask_fp32 shift ;; build_test_arm_subtask_armlinux) diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index abb60f6141fbee53916a7db1711cf606afb09924..0cf14d12d553a4d9f7f4ed9780e4274560a8b23f 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,8 +56,8 @@ const std::vector> supported_ops_target = { ops_lines = [] # valid targets and valid_ops -valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[]] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 @@ -73,6 +73,7 @@ class TargetType: kMLU = 11 kRKNPU = 12 kAPU = 13 + kHuaweiAscendNPU = 14 # record op_info of valid kernels into `valid_ops` according to different target type diff --git a/lite/utils/all.h b/lite/utils/all.h index a0d323aa24b36dac7858f484eb1cf1d5a7bcba50..8586188b99971d04271d14ac2d3b8043b0ea4414 100644 --- a/lite/utils/all.h +++ b/lite/utils/all.h @@ -14,10 +14,16 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "lite/utils/any.h" #include "lite/utils/check.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/factory.h" #include "lite/utils/hash.h" #include "lite/utils/io.h" #include "lite/utils/macros.h" diff --git a/lite/utils/env.h b/lite/utils/env.h index 3048c84b42f6f658eaf0c8ee0d08456f53162c37..1d26148cea1ed499c8d5ca408ae9235788be6e91 100644 --- a/lite/utils/env.h +++ b/lite/utils/env.h @@ -15,13 +15,24 @@ #pragma once #include #include - #include #include +// Specify the path of configuration file for the subgraph segmentation, an +// example is shown as below: +// op_type:in_var_name_0,in_var_name1:out_var_name_0,out_var_name1 +// op_type::out_var_name_0 +// op_type:in_var_name_0 +// op_type #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \ "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE" +// The original weight/local/unused variables in the subblock of the subgraph op +// will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to true(default) during +// the analysis phase, it ensure the ops in the subblock can be converted to the +// target device model online during the execution phase. 
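On the SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE comment documented a few lines above: each non-empty line of the configuration file names an op type and, optionally, colon-separated input and output variable lists. The parsing sketch below only illustrates that documented line format; it is not the library's implementation, and it is unrelated to the SUBGRAPH_ONLINE_MODE flag defined next.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// One rule per line, of the documented form:
//   op_type[:in_var_0,in_var_1[:out_var_0,out_var_1]]
// Empty fields are allowed, e.g. "op_type::out_var_0".
struct PartitionRule {
  std::string op_type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

static std::vector<std::string> SplitCSV(const std::string& s) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, ',')) {
    if (!item.empty()) out.push_back(item);
  }
  return out;
}

PartitionRule ParseLine(const std::string& line) {
  std::stringstream ss(line);
  std::string op, ins, outs;
  std::getline(ss, op, ':');
  std::getline(ss, ins, ':');
  std::getline(ss, outs, ':');
  return {op, SplitCSV(ins), SplitCSV(outs)};
}

int main() {
  const std::vector<std::string> lines = {"conv2d:in0,in1:out0", "softmax::out0",
                                          "relu"};
  for (const auto& l : lines) {
    PartitionRule r = ParseLine(l);
    std::cout << r.op_type << ": " << r.inputs.size() << " input(s), "
              << r.outputs.size() << " output(s)\n";
  }
}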
+#define SUBGRAPH_ONLINE_MODE "SUBGRAPH_ONLINE_MODE" + namespace paddle { namespace lite { diff --git a/lite/utils/factory.h b/lite/utils/factory.h deleted file mode 100644 index d286ceb42ce32dba68bc68cabab2a600ad3d7789..0000000000000000000000000000000000000000 --- a/lite/utils/factory.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Factor for any Type creator. - * - * Usage: - * - * struct SomeType; - * // Register a creator. - * Factory::Global().Register("some_key", [] -> - * std::unique_ptr { ... }); - * // Retrive a creator. - * auto some_type_instance = Factory::Global().Create("some_key"); - */ -template -class Factory { - public: - using item_t = ItemType; - using self_t = Factory; - using item_ptr_t = ItemTypePtr; - using creator_t = std::function; - - static Factory& Global() { - static Factory* x = new self_t; - return *x; - } - - void Register(const std::string& op_type, creator_t&& creator) { - creators_[op_type].emplace_back(std::move(creator)); - } - - item_ptr_t Create(const std::string& op_type) const { - auto res = Creates(op_type); - if (res.empty()) return nullptr; - CHECK_EQ(res.size(), 1UL) << "Get multiple Op for type " << op_type; - return std::move(res.front()); - } - - std::list Creates(const std::string& op_type) const { - std::list res; - auto it = creators_.find(op_type); - if (it == creators_.end()) return res; - for (auto& c : it->second) { - res.emplace_back(c()); - } - return res; - } - - std::string DebugString() const { - STL::stringstream ss; - for (const auto& item : creators_) { - ss << " - " << item.first << "\n"; - } - return ss.str(); - } - - protected: - std::map> creators_; -}; - -/* A helper function to help run a lambda at the start. - */ -template -class Registor { - public: - explicit Registor(std::function&& functor) { functor(); } - - // Touch will do nothing. 
- int Touch() { return 0; } -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/utils/io.h b/lite/utils/io.h index 2141364df79bb189772592a556dd9a115ae1a67e..5de95e72f06856df01189e8ae3f1c22115801094 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -120,5 +120,40 @@ static std::vector ListDir(const std::string& path, return paths; } +static bool ReadFile(const std::string& filename, std::vector* contents) { + FILE* fp = fopen(filename.c_str(), "rb"); + if (!fp) return false; + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + fseek(fp, 0, SEEK_SET); + contents->clear(); + contents->resize(size); + size_t offset = 0; + char* ptr = reinterpret_cast(&(contents->at(0))); + while (offset < size) { + size_t already_read = fread(ptr, 1, size - offset, fp); + offset += already_read; + ptr += already_read; + } + fclose(fp); + return true; +} + +static bool WriteFile(const std::string& filename, + const std::vector& contents) { + FILE* fp = fopen(filename.c_str(), "wb"); + if (!fp) return false; + size_t size = contents.size(); + size_t offset = 0; + const char* ptr = reinterpret_cast(&(contents.at(0))); + while (offset < size) { + size_t already_written = fwrite(ptr, 1, size - offset, fp); + offset += already_written; + ptr += already_written; + } + fclose(fp); + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.h b/lite/utils/logging.h index f292f220c006135af664ea34acc03525a5c112ab..c7fa8d4cf113abebb29c4ebe972e243a39573cf0 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -57,7 +57,7 @@ static int gettimeofday(struct timeval* tp, void* tzp) { #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" -#ifdef LITE_WITH_ANDROID +#if defined(LITE_WITH_LOG) && defined(LITE_WITH_ANDROID) #include // Android log macors #define ANDROID_LOG_TAG "Paddle-Lite" @@ -143,8 +143,10 @@ class LogMessage { ANDROID_LOG_I(log_stream_.str().c_str()); } else if (level_ == "W") { ANDROID_LOG_W(log_stream_.str().c_str()); + } else if (level_ == "F") { + ANDROID_LOG_F(log_stream_.str().c_str()); } else { - fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + fprintf(stderr, "Unsupported log level: %s\n", level_.c_str()); assert(false); } #endif @@ -170,17 +172,25 @@ class LogMessageFatal : public LogMessage { const char* level = "F") : LogMessage(file, func, lineno, level) {} - ~LogMessageFatal() { + ~LogMessageFatal() +#ifdef LITE_WITH_EXCEPTION + noexcept(false) +#endif + { log_stream_ << '\n'; #ifdef LITE_WITH_ANDROID ANDROID_LOG_F(log_stream_.str().c_str()); #endif fprintf(stderr, "%s", log_stream_.str().c_str()); +#ifdef LITE_WITH_EXCEPTION + throw std::exception(); +#else #ifndef LITE_ON_TINY_PUBLISH abort(); #else assert(false); +#endif #endif } }; @@ -237,7 +247,11 @@ class Voidify { class VoidifyFatal : public Voidify { public: +#ifdef LITE_WITH_EXCEPTION + ~VoidifyFatal() noexcept(false) { throw std::exception(); } +#else ~VoidifyFatal() { assert(false); } +#endif }; #endif diff --git a/lite/utils/md5.h b/lite/utils/md5.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5 --- /dev/null +++ b/lite/utils/md5.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/lite/utils/md5.h b/lite/utils/md5.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5
--- /dev/null
+++ b/lite/utils/md5.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+namespace paddle {
+namespace lite {
+
+std::string MD5(std::string message) {
+  const uint32_t shiftAmounts[] = {
+      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
+      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
+  const uint32_t partsOfSines[] = {
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
+
+  uint32_t state[4];
+  state[0] = 0x67452301;
+  state[1] = 0xefcdab89;
+  state[2] = 0x98badcfe;
+  state[3] = 0x10325476;
+
+  // Pad with zeros
+  int size = ((((message.length() + 8) / 64) + 1) * 64) - 8;
+  uint8_t *buf = reinterpret_cast<uint8_t *>(calloc(size + 64, 1));
+  memcpy(buf, message.c_str(), message.length());
+  buf[message.length()] = 128;
+  uint32_t bits = 8 * message.length();
+  memcpy(buf + size, &bits, 4);
+
+// Process at each 512-bit(64 bytes) chunk
+#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
+  for (int offset = 0; offset < size; offset += 64) {
+    uint32_t A = state[0];
+    uint32_t B = state[1];
+    uint32_t C = state[2];
+    uint32_t D = state[3];
+    uint32_t *W = reinterpret_cast<uint32_t *>(buf + offset);
+    for (uint32_t i = 0; i < 64; i++) {
+      uint32_t F, g;
+      if (i < 16) {
+        F = (B & C) | ((~B) & D);
+        g = i;
+      } else if (i < 32) {
+        F = (D & B) | ((~D) & C);
+        g = (5 * i + 1) % 16;
+      } else if (i < 48) {
+        F = B ^ C ^ D;
+        g = (3 * i + 5) % 16;
+      } else {
+        F = C ^ (B | (~D));
+        g = (7 * i) % 16;
+      }
+      uint32_t T = D;
+      D = C;
+      C = B;
+      B = B + LEFTROTATE((A + F + partsOfSines[i] + W[g]), shiftAmounts[i]);
+      A = T;
+    }
+    state[0] += A;
+    state[1] += B;
+    state[2] += C;
+    state[3] += D;
+  }
+#undef LEFTROTATE
+  free(buf);
+
+  // Convert digest to string
+  std::string res;
+  res.reserve(16 << 1);
+  const uint8_t *digest = reinterpret_cast<const uint8_t *>(state);
+  char hex[3];
+  for (size_t i = 0; i < 16; i++) {
+    snprintf(hex, sizeof(hex), "%02x", digest[i]);
+    res.append(hex);
+  }
+  return res;
+}
+
+}  // namespace lite
+}  // namespace paddle
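lite/utils/md5.h adds a small header-only MD5 routine that returns the digest as a 32-character lowercase hex string. A minimal usage sketch; the C headers are included explicitly here because md5.h itself only pulls in <string> while its body relies on uint32_t, calloc, memcpy, and snprintf:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <string>
#include "lite/utils/md5.h"

int main() {
  // The digest is a deterministic 32-hex-character string for a given input.
  std::string digest = paddle::lite::MD5("some bytes to fingerprint");
  std::cout << digest.size() << " " << digest << std::endl;  // prints 32 and the digest
  return 0;
}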
diff --git a/lite/utils/paddle_enforce.h b/lite/utils/paddle_enforce.h
deleted file mode 100644
index 82534af996919ac69a8624e442f1af6a9abb2c07..0000000000000000000000000000000000000000
--- a/lite/utils/paddle_enforce.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file defines PADDLE_ENFORCE_xx, which helps to adapt the legacy fluid
- * codes.
- */
-#pragma once
-#include "lite/utils/cp_logging.h"
-#include "lite/utils/string.h"
-
-#define PADDLE_ENFORCE(cond, ...) \
-  CHECK((cond)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_EQ(a, b, ...) \
-  CHECK_EQ((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_LE(a, b, ...) \
-  CHECK_LE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_LT(a, b, ...) \
-  CHECK_LT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-
-#define PADDLE_ENFORCE_GE(a, b, ...) \
-  CHECK_GE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-#define PADDLE_ENFORCE_GT(a, b, ...) \
-  CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
-
-#ifndef PADDLE_THROW
-#define PADDLE_THROW(...) printf("" __VA_ARGS__);
-#endif
diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc
index 081006be6711d5d26c405181fd6d86e89c9e4e95..8e14e4d6d5dbab8dc01b9f8a07910a905cae6abf 100644
--- a/lite/utils/replace_stl/stream.cc
+++ b/lite/utils/replace_stl/stream.cc
@@ -23,6 +23,14 @@ namespace paddle {
 namespace lite {
 namespace replace_stl {
 
+#ifndef LITE_WITH_LOG
+#define ADD_DATA_AS_STRING(data_, obj_)
+#else
+#define ADD_DATA_AS_STRING(data_, obj_)    \
+  std::string text = std::to_string(obj_); \
+  pad(text);                               \
+  data_ = data_ + text;
+
 void ostream::pad(const std::string& text) {
   if (display_width_ > 0) {
     if (display_width_ < text.size()) {
@@ -36,15 +44,6 @@ void ostream::pad(const std::string& text) {
     }
   }
 }
-
-#ifndef LITE_WITH_LOG
-#define ADD_DATA_AS_STRING(data_, obj_)
-#else
-#define ADD_DATA_AS_STRING(data_, obj_)    \
-  std::string text = std::to_string(obj_); \
-  pad(text);                               \
-  data_ = data_ + text;
-
 #endif
 
 template <>
diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h
index 3288a1986906b3fd600b91b6a56ae7134644456f..c58265a0cd864ebe2d2d158d953b17e2c230531f 100644
--- a/lite/utils/replace_stl/stream.h
+++ b/lite/utils/replace_stl/stream.h
@@ -57,7 +57,9 @@ class ostream {
   ostream& operator<<(const T* obj);
 
  private:
+#ifdef LITE_WITH_LOG
   void pad(const std::string& text);
+#endif
   std::string data_;
   int display_width_{-1};  // -1 refers to no setting
 };
diff --git a/lite/utils/string.h b/lite/utils/string.h
index ada51d0b85d7536bfc937a7b1b8368a0f0e053be..b1aaf5d6c56d8931c4ad416f9d38c947abc68dd8 100644
--- a/lite/utils/string.h
+++ b/lite/utils/string.h
@@ -60,6 +60,38 @@ static std::string to_string(const T& v) {
   return ss.str();
 }
 
+static std::string to_string(int index) {
+  const int BUFFER_LENGTH = 15;
+  char buffer[BUFFER_LENGTH];
+  snprintf(buffer, sizeof(buffer), "%d", index);
+  return std::string(buffer);
+}
+
+template <class T = std::string>
+static T parse_string(const std::string& v) {
+  return v;
+}
+
+template <>
+int32_t parse_string<int32_t>(const std::string& v) {
+  return std::stoi(v);
+}
+
+template <>
+int64_t parse_string<int64_t>(const std::string& v) {
+  return std::stoll(v);
+}
+
+template <>
+float parse_string<float>(const std::string& v) {
+  return std::stof(v);
+}
+
+template <>
+double parse_string<double>(const std::string& v) {
+  return std::stod(v);
+}
+
 template <typename T>
 std::string Join(const std::vector<T>& vec, const std::string& delim) {
   if (vec.empty()) return "";
@@ -84,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) {
   return "{" + Join(tmp, ",") + "}";
 }
 
-static std::vector<std::string> Split(const std::string& original,
-                                      const std::string& separator) {
-  std::vector<std::string> results;
+template <class T = std::string>
+static std::vector<T> Split(const std::string& original,
+                            const std::string& separator) {
+  std::vector<T> results;
   std::string::size_type pos1, pos2;
   pos2 = original.find(separator);
   pos1 = 0;
   while (std::string::npos != pos2) {
-    results.push_back(original.substr(pos1, pos2 - pos1));
+    results.push_back(parse_string<T>(original.substr(pos1, pos2 - pos1)));
     pos1 = pos2 + separator.size();
    pos2 = original.find(separator, pos1);
   }
   if (pos1 != original.length()) {
-    results.push_back(original.substr(pos1));
+    results.push_back(parse_string<T>(original.substr(pos1)));
   }
   return results;
 }
diff --git a/third-party/flatbuffers b/third-party/flatbuffers
new file mode 160000
index 0000000000000000000000000000000000000000..6df40a2471737b27271bdd9b900ab5f3aec746c7
--- /dev/null
+++ b/third-party/flatbuffers
@@ -0,0 +1 @@
+Subproject commit 6df40a2471737b27271bdd9b900ab5f3aec746c7
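With Split now templated on the element type and parse_string specialized for the common numeric types, delimiter-separated lists can be split and parsed in one step. A minimal sketch, assuming the default template argument preserves the old split-into-strings behavior as reconstructed above:

#include <string>
#include <vector>
#include "lite/utils/string.h"

int main() {
  using paddle::lite::Split;

  // Default behaves as before: split into strings.
  std::vector<std::string> names = Split("conv1,relu1,fc1", ",");

  // New: split and convert to a numeric type in one call.
  std::vector<int64_t> shape = Split<int64_t>("1,3,224,224", ",");  // parsed via std::stoll
  std::vector<float> scales = Split<float>("0.5,0.25", ",");        // parsed via std::stof

  return (names.size() == 3 && shape.size() == 4 && scales.size() == 2) ? 0 : 1;
}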