diff --git a/CMakeLists.txt b/CMakeLists.txt index 55375994031850d93caa89ec7050a9e8e657d04f..e598f1dcd501b2ca09273a0914ff4cdf66f8b0e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) +lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) @@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_WITH_LOG "Enable log printing or not." ON) +lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF) lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) @@ -224,6 +226,11 @@ endif() if(LITE_WITH_MLU) include(mlu) endif() + +if(LITE_WITH_HUAWEI_ASCEND_NPU) + include(device/huawei_ascend_npu) +endif() + include(coveralls) include(external/mklml) # download mklml package diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 1b0890e0dbf5e741176c293a059d809752c72a43..773de573aff92599ad6e5fb746a2956d9e50a8c2 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,6 +174,10 @@ if (LITE_WITH_MLU) add_definitions("-DLITE_WITH_MLU") endif() +if (LITE_WITH_HUAWEI_ASCEND_NPU) +add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU") +endif() + if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") endif() @@ -190,6 +194,10 @@ if (LITE_WITH_LOG) add_definitions("-DLITE_WITH_LOG") endif() +if (LITE_WITH_EXCEPTION) + add_definitions("-DLITE_WITH_EXCEPTION") +endif() + if (LITE_ON_TINY_PUBLISH) add_definitions("-DLITE_ON_TINY_PUBLISH") endif() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index e6193e0bb3c93292d2264501fc4d5739ff8766ee..68f91fe88173f1cd254bc44d5e7dbcd456bfcdb8 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang") elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") set(triple arm-v7a-linux-android) set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE) + if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + # Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ is linked in the right order + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a") + if (ANDROID_API_LEVEL LESS 21) + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a") + endif() + if(ANDROID_STL_TYPE STREQUAL "c++_shared") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} 
${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so") + elseif(ANDROID_STL_TYPE STREQUAL "c++_static") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a") + set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a") + else() + message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.") + endif() + endif() else() message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") endif() diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 069923c779fbd3eed4f5f81ef3e386ff70fac215..c9c3fc9f2681b6002567d555a26ee14edefaeae5 100644 --- a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -23,6 +23,21 @@ if(ANDROID) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC") + + # Don't re-export libgcc symbols + set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + + # Only the libunwind.a from clang(with libc++) provide C++ exception handling support for 32-bit ARM + # Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding + if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_") + set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a") + set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}") + set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}") + endif() endif() if(ARMLINUX) @@ -59,14 +74,13 @@ function(check_linker_flag) endfunction() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables") +else () + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables") +endif() if (LITE_ON_TINY_PUBLISH) - if((NOT LITE_WITH_PYTHON)) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") - endif() - if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") - endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() diff --git a/cmake/device/huawei_ascend_npu.cmake b/cmake/device/huawei_ascend_npu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0bd9591eee702f4db914a8b547c4c99b21d0473b --- /dev/null +++ b/cmake/device/huawei_ascend_npu.cmake 
@@ -0,0 +1,169 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +# 1. path to Huawei Ascend Install Path +if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT) + set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT}) + if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT) + message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON") + endif() +endif() +message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}") + +# 2. Huawei Ascend include directory +set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include") +set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include") +set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp") +include_directories(${ACL_INCLUDE_DIR}) +include_directories(${ATC_INCLUDE_DIR}) +include_directories(${OPP_INCLUDE_DIR}) + +# 3 find ACL Libs (ACL libs should before ATC libs) +find_library(ACL_ASCENDCL_FILE NAMES ascendcl + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_ASCENDCL_FILE) + message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}") + add_library(acl_ascendcl SHARED IMPORTED GLOBAL) + set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE}) +endif() + +# 3.1 ascendcl dependency - libruntime.so +find_library(ACL_RUNTIME_FILE NAMES runtime + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64 + NO_DEFAULT_PATH) + +if(NOT ACL_RUNTIME_FILE) + message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64") +else() + message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}") + add_library(acl_runtime SHARED IMPORTED GLOBAL) + set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE}) +endif() + +# 4.1 find ATC libs - libregister.so +find_library(ATC_REGISTER_FILE NAMES register + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_REGISTER_FILE) + message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}") + add_library(atc_register SHARED IMPORTED GLOBAL) + set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE}) +endif() + +# 4.1.1 dependency of register - libprotobuf.so.19, +find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19 + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + + if(NOT ATC_REGISTER_FILE) + message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}") + add_library(atc_protobuf SHARED IMPORTED GLOBAL) + set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE}) +endif() + +# 
4.1.2 dependency of register - libgraph.so +find_library(ATC_GRAPH_FILE NAMES graph + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GRAPH_FILE) + message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}") + add_library(atc_graph SHARED IMPORTED GLOBAL) + set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE}) +endif() + +# 4.2 find ATC libs - libge_compiler.so +find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMPILER_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}") + add_library(atc_ge_compiler SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE}) +endif() + +# 4.2.1 dependencies of libge_compiler.so - libge_common.so +find_library(ATC_GE_COMMON_FILE NAMES ge_common + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_GE_COMMON_FILE) + message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}") + add_library(atc_ge_common SHARED IMPORTED GLOBAL) + set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE}) +endif() + +# 4.2.3 dependencies of libge_compiler.so - libresource.so +find_library(ATC_RESOURCE_FILE NAMES resource + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_RESOURCE_FILE) + message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}") + add_library(atc_resource SHARED IMPORTED GLOBAL) + set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE}) +endif() + +# 4.3 find OPP libs - libopsproto.so +find_library(OPP_OPS_PROTO_FILE NAMES opsproto + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in + NO_DEFAULT_PATH) + +if(NOT OPP_OPS_PROTO_FILE) + message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in") +else() + message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}") + add_library(opp_ops_proto SHARED IMPORTED GLOBAL) + set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE}) +endif() + +# 4.3.1 dependency of opp_ops_proto - liberror_manager.so +find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager + PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64 + NO_DEFAULT_PATH) + +if(NOT ATC_ERROR_MANAGER_FILE) + message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64") +else() + message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}") + add_library(atc_error_manager SHARED IMPORTED GLOBAL) + set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE}) +endif() + +# note: huawei_ascend_npu_runtime_libs should before huawei_ascend_npu_builder_libs +set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs") +set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager + atc_ge_compiler atc_ge_common 
atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs") \ No newline at end of file diff --git a/cmake/device/npu.cmake b/cmake/device/npu.cmake index 88598f4690a157b20ac1873d84ad13c2f8652725..0409b6a60fc651cbaade61998a09bc0489bc978c 100644 --- a/cmake/device/npu.cmake +++ b/cmake/device/npu.cmake @@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} NO_DEFAULT_PATH) +# Added in HiAI DDK 320 or later version +find_library(NPU_DDK_HCL_FILE NAMES hcl + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) + if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") else() @@ -78,5 +83,13 @@ else() set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) endif() -set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") +if(NOT NPU_DDK_HCL_FILE) +# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}") +else() + message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}") + add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL) + set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE}) +endif() + +set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake index 7c6374b40b92a8807c5bb9529d907c576f6ad05c..e6ab31ee855f5bbc0594f37c00a3ec46d8e4231d 100644 --- a/cmake/external/flatbuffers.cmake +++ b/cmake/external/flatbuffers.cmake @@ -94,12 +94,10 @@ function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT) message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}") string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS}) add_custom_command( - OUTPUT ${GEN_HEADER} + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}" COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}" --cpp --gen-mutable --gen-object-api --reflect-names - --force-empty --force-empty-vectors ${OPT} - -I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test" -o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}" DEPENDS flatbuffers diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 8408a79fa4265b08771e435dcc5e82801a9d40f9..fe66d0f643e9bdf0cb778c4e4647294f553c023e 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -118,6 +118,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() @@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS 
RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -165,6 +171,7 @@ function(lite_cc_library TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) if (args_SHARED OR ARGS_shared) @@ -193,7 +200,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -215,6 +222,7 @@ function(lite_cc_binary TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) if(NOT WIN32) @@ -246,7 +254,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -276,6 +284,7 @@ function(lite_cc_test TARGET) HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") +set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") @@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) +# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -438,6 +448,15 @@ function(add_kernel 
TARGET device level) endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU") + if (NOT LITE_WITH_HUAWEI_ASCEND_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) foreach(src ${args_SRCS}) @@ -481,6 +500,7 @@ function(add_kernel TARGET device level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -499,7 +519,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -537,6 +557,7 @@ function(add_operator TARGET level) RKNPU_DEPS ${args_RKNPU_DEPS} BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} + HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md index 31a0e411566297d5556e6b7fffcec1343cd83781..52ea158cf7b9c827c17225b6690b1bd9d8d15d24 100644 --- a/docs/demo_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h rm ./lite/api/paddle_use_ops.h # 设置编译参数并开始编译 +# android-armv7:cpu+gpu+cv+extra ./lite/tools/build_android.sh \ --arch=armv7 \ --toolchain=clang \ - --with_cv=OFF \ --with_log=OFF \ - --with_extra=OFF \ + --with_extra=ON \ + --with_cv=ON \ --with_opencl=ON +# android-armv8:cpu+gpu+cv+extra +./lite/tools/build_android.sh \ + --arch=armv8 \ + --toolchain=clang \ + --with_log=OFF \ + --with_extra=ON \ + --with_cv=ON \ + --with_opencl=ON + + # 注:编译帮助请执行: ./lite/tools/build_android.sh help ``` @@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \ ## 3. 
如何在Code中使用 -即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc); +即编译产物`demo/cxx/mobile_light`目录下的代码,在线版参考GitHub仓库[./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc),其中也包括判断当前设备是否支持OpenCL的方法; 注:这里给出的链接会跳转到线上最新develop分支的代码,很可能与您本地的代码存在差异,建议参考自己本地位于`lite/demo/cxx/`目录的代码,查看如何使用。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index eeea3b3adf4caf2e3ea57eb365c32f24626851e6..10601e34f9815bfee88d8dba58988169839cc86d 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") +message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") @@ -45,6 +46,7 @@ if (WITH_TESTING) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz") endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index d28b2afc9d83c291ee4a24814bdc9491ba914273..6ff381268a5796a52136214b64db39c057b5d59b 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -11,7 +11,7 @@ endif() set(light_lib_DEPS light_api paddle_api paddle_api_light) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc DEPS paddle_api paddle_api_light paddle_api_full) @@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH NPU_DEPS ${npu_kernels} APU_DEPS ${apu_kernels} RKNPU_DEPS ${rknpu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if(WIN32) target_link_libraries(paddle_light_api_shared shlwapi.lib) endif() - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels}) if(APPLE) set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds") set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}") @@ -94,6 +95,7 @@ if (WITH_TESTING) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS 
${huawei_ascend_npu_kernels} APU_DEPS ${apu_kernels}) endif() @@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU) set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) endif() +if(LITE_WITH_HUAWEI_ASCEND_NPU) + set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps}) +endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") @@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") +message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}") # for full api if (NOT LITE_ON_TINY_PUBLISH) @@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH) RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() # for light api @@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - MLU_DEPS ${mlu_kernels}) + MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -191,6 +200,7 @@ if(WITH_TESTING) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH) APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels}) + BM_DEPS ${bm_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) # The final inference library for just MobileConfig. 
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE) FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) @@ -415,7 +427,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -430,7 +443,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -445,7 +459,8 @@ if(NOT IOS) RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -459,7 +474,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -470,8 +486,9 @@ if(NOT IOS) XPU_DEPS ${xpu_kernels} RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} CL_DEPS ${opencl_kernels} - BM_DEPS ${bm_kernels} + BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -487,7 +504,8 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} - CUDA_DEPS ${cuda_kernels}) + CUDA_DEPS ${cuda_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index ad3eb429b393514e170e68915b49c251164b38c2..52fc33830828ce1325a77b821f1cea4c329e933b 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir, if (!program_) { GenRuntimeProgram(); } - program_->SaveOpInfosToProgram(program_desc_.get()); - program_->UpdateVarsOfProgram(program_desc_.get()); + program_->SaveToProgram(program_desc_); switch (model_type) { case lite_api::LiteModelType::kProtobuf: SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true); @@ -58,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir, void Predictor::SaveOpKernelInfo(const std::string &model_dir) { std::set ops_info; std::set kernels_info; - const auto &instructions_ = program_->instructions(); - for (auto &node : instructions_) { - // parse op type infomation - auto op = node.op()->op_info(); - ops_info.insert(op->Type()); - // parse kernel type information - std::string kernel_type_str = - node.kernel()->op_type() + "," + 
TargetRepr(node.kernel()->target()) + - "," + PrecisionRepr(node.kernel()->precision()) + "," + - DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias(); - kernels_info.insert(kernel_type_str); + auto block_size = program_->block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + const auto &insts = program_->instructions(block_idx); + for (auto &inst : insts) { + // parse op type infomation + auto op = inst.op()->op_info(); + ops_info.insert(op->Type()); + // parse kernel type information + std::string kernel_type_str = + inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) + + "," + PrecisionRepr(inst.kernel()->precision()) + "," + + DataLayoutRepr(inst.kernel()->layout()) + "," + + inst.kernel()->alias(); + kernels_info.insert(kernel_type_str); + } } // get souce_file name from op type and kernel type @@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() { std::vector feeds; std::vector fetchs; - const auto &insts = program_->instructions(); - for (size_t i = 0; i < program_->num_instructions(); i++) { - const auto &op = insts[i].op()->op_info(); + const auto &insts = program_->instructions(kRootBlockIdx); + for (auto &inst : insts) { + const auto &op = inst.op()->op_info(); if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { @@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config, } else { LOG(INFO) << "Load model from file."; } - Build(model_path, model_file, param_file, @@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path, Build(program_desc_, valid_places, passes); } -void Predictor::Build(const std::shared_ptr &desc, +void Predictor::Build(const std::shared_ptr &program_desc, const std::vector &valid_places, const std::vector &passes) { - program_desc_ = desc; + program_desc_ = program_desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; for (auto &valid_place : valid_places) { @@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr &desc, Place{TARGET(kARM), PRECISION(kInt8)}); } - Program program(*desc.get(), scope_, inner_places); + Program program(program_desc_, scope_, inner_places); valid_places_ = inner_places; core::KernelPickFactor factor; diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 20364609deb0261acac489f182cdfa740d6e6346..ceb823d5811aed26792318e3c1bf718ad9c2d851 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -49,18 +49,33 @@ class LITE_API Predictor { program_desc_ = std::make_shared(); } - // Create a predictor with the weight variable scope set. + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. Create a predictor with the + // weight variable scope set given. + /////////////////////////////////////////////////////////////////// explicit Predictor(const std::shared_ptr& root_scope) : scope_(root_scope) {} - Predictor(const std::shared_ptr& desc, + /////////////////////////////////////////////////////////////////// + // Function: Predictor + // Usage: Constructor of Predictor. This constructor function can + // only be called in Predictor->Clone. This Function will create + // a predictor from existed ProgramDesc, Scope and RuntimeProgram. 
+ /////////////////////////////////////////////////////////////////// + Predictor(const std::shared_ptr& program_desc, const std::shared_ptr& root, const std::vector& valid_places, const std::vector& var_names = {}) - : program_desc_(desc), scope_(root) { - Program program(*desc.get(), scope_, valid_places, var_names); - optimizer_ = Optimizer(std::move(program), valid_places); - exec_scope_ = optimizer_.exec_scope(); + : program_desc_(program_desc), scope_(root) { + // step1. Create a Program to construct the exec_scope and ops + Program program(program_desc_, scope_, valid_places, var_names); + exec_scope_ = program.exec_scope(); valid_places_ = valid_places; + + // step3. Create the RuntimeProgram. + program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx)); + program_generated_ = true; } // Build from a model, with places set for hardware config. @@ -79,32 +94,62 @@ class LITE_API Predictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool memory_from_memory = false); - void Build(const std::shared_ptr& desc, + void Build(const std::shared_ptr& program_desc, const std::vector& valid_places, const std::vector& passes = {}); - std::shared_ptr Clone() const { + ////////////////////////////////////////////////////////// + // Function: Clone + // Usage: Create a Predictor from an existed one, + // the cloned predictor will share persistable variables + // in scope_ with the original predictor. + ////////////////////////////////////////////////////////// + std::shared_ptr Clone() { + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor friom current program_desc_ and + // runtime_program. auto predictor = std::make_shared(program_desc_, scope_, valid_places_); + // step3. Return the result return predictor; } - - std::shared_ptr Clone( - const std::vector& var_names) const { + ////////////////////////////////////////////////////////// + // Function: Clone(var_names) + // Usage: Create a Predictor from an existed one, + // the cloned predictor will share persistable variables + // but persistable variables of name var_names will not + // be shared. + ////////////////////////////////////////////////////////// + std::shared_ptr Clone(const std::vector& var_names) { CHECK(program_desc_) << "Both program and scope of current predicotr " "should be not be nullptr in Clone mode."; CHECK(scope_) << "Both program and scope of current predicotr should be " "not be nullptr in Clone mode."; + // step 1. Generate runtime_program, update op_info and var_info in + // program_desc_ + if (!program_generated_) { + GenRuntimeProgram(); + } + program_->SaveToProgram(program_desc_); + // step 2. Create a predictor friom current program_desc_ and + // runtime_program. auto predictor = std::make_shared( program_desc_, scope_, valid_places_, var_names); - - for (auto i : var_names) { - predictor->exec_scope_->LocalVar(i); - auto* tensor = predictor->scope_->Var(i)->GetMutable(); + // step3. Copy some persistable variables into private scope. + for (auto var_name : var_names) { + predictor->exec_scope_->LocalVar(var_name); + auto* tensor = + predictor->scope_->Var(var_name)->GetMutable(); auto* sub_tensor = - predictor->exec_scope_->Var(i)->GetMutable(); + predictor->exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } + // step4. 
Return the result return predictor; } @@ -140,6 +185,7 @@ class LITE_API Predictor { // get a const tensor according to its name const lite::Tensor* GetTensor(const std::string& name) const; const RuntimeProgram& runtime_program() const; + Scope* scope() { return scope_.get(); } // This method is disabled in mobile, for unnecessary dependencies required. void SaveModel( @@ -162,7 +208,7 @@ class LITE_API Predictor { std::shared_ptr program_desc_; std::shared_ptr scope_; Scope* exec_scope_; - std::unique_ptr program_; + std::shared_ptr program_; bool program_generated_{false}; std::vector input_names_; std::vector output_names_; diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 4f4b0d0766c173bbbc50f6a0e99fc9d49d2470f4..726783349f0dcc049c4578df5c9e0ecbdb3dee4f 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { mode_ = config.power_mode(); threads_ = config.threads(); #ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( config.subgraph_model_cache_dir()); #endif #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index f0d1fb96fe4dfd5f8fa57808a2098cbc42db6a11..fbcf171726d741ef0073f423bc4a600c9f9389d0 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -22,17 +22,18 @@ namespace lite { void LightPredictor::Build(const std::string& lite_model_file, bool model_from_memory) { if (model_from_memory) { - LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromMemory( + lite_model_file, scope_.get(), program_desc_.get()); } else { - LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); + LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get()); } // For weight quantization of post training, load the int8/16 weights // for optimized model, and dequant it to fp32. 
DequantizeWeight(); - - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); + program_desc_.reset(); } void LightPredictor::Build(const std::string& model_dir, @@ -43,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir, switch (model_type) { #ifndef LITE_ON_TINY_PUBLISH case lite_api::LiteModelType::kProtobuf: - LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_); + LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get()); break; #endif case lite_api::LiteModelType::kNaiveBuffer: { if (model_from_memory) { LoadModelNaiveFromMemory( - model_buffer, param_buffer, scope_.get(), &cpp_program_desc_); + model_buffer, param_buffer, scope_.get(), program_desc_.get()); } else { - LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_); + LoadModelNaive(model_dir, scope_.get(), program_desc_.get()); } break; } @@ -60,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir, } DequantizeWeight(); - BuildRuntimeProgram(cpp_program_desc_); + BuildRuntimeProgram(program_desc_); PrepareFeedFetch(); } @@ -109,15 +110,17 @@ std::vector LightPredictor::GetOutputNames() { } // append the names of inputs and outputs into input_names_ and output_names_ void LightPredictor::PrepareFeedFetch() { - auto current_block = cpp_program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; - for (size_t i = 0; i < current_block->OpsSize(); i++) { - auto op = current_block->GetOp(i); - if (op->Type() == "feed") { - feeds.push_back(op); - } else if (op->Type() == "fetch") { - fetchs.push_back(op); + std::vector feeds; + std::vector fetchs; + std::shared_ptr program_desc = program_desc_; + auto main_block = program_desc->GetBlock(kRootBlockIdx); + auto op_size = main_block->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto op_desc = main_block->GetOp(op_idx); + if (op_desc->Type() == "feed") { + feeds.push_back(op_desc); + } else if (op_desc->Type() == "fetch") { + fetchs.push_back(op_desc); } } input_names_.resize(feeds.size()); @@ -132,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() { } } -void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { - std::vector insts; - // 1. Create op first - Program program(prog, scope_, {}); - -// 2. Create Instructs -#ifdef LITE_WITH_OPENCL - using OpenCLContext = Context; - std::unique_ptr local_ctx(new KernelContext()); - local_ctx->As().InitOnce(); -#endif - - // Create the kernels of the target places, and filter out the specific - // kernel with the target alias. 
- for (auto& op : program.ops()) { - auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); - std::string op_type, alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - auto kernels = op->CreateKernels({place}); - // filter out a kernel - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - -#ifdef LITE_WITH_OPENCL - if ((*it)->target() == TARGET(kOpenCL)) { - std::unique_ptr ctx(new KernelContext()); - (*local_ctx).As().CopySharedTo(&ctx->As()); - (*it)->SetContext(std::move(ctx)); - } else { - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +void LightPredictor::BuildRuntimeProgram( + const std::shared_ptr& program_desc) { + auto* exe_scope = &scope_->NewScope(); + // Prepare workspace + scope_->Var("feed")->GetMutable>(); + scope_->Var("fetch")->GetMutable>(); + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto var_desc = block_desc->GetVar(var_idx); + if (!var_desc->Persistable()) { + exe_scope->Var(var_desc->Name()); + } else { + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue; + scope_->Var(var_desc->Name()); + } } -#else - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); -#endif - - insts.emplace_back(op, std::move(*it)); } - program_.reset(new RuntimeProgram(std::move(insts))); - - CHECK(program.exec_scope()); - program_->set_exec_scope(program.exec_scope()); + // Only extracting the ops and generate the runtime program from the main + // block desc + program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx)); } void LightPredictor::DequantizeWeight() { + std::shared_ptr program_desc = program_desc_; #define PROCESS_CONV2D_DATA() \ for (int64_t i = 0; i < ch; ++i) { \ for (int64_t j = 0; j < offset; ++j) { \ @@ -205,10 +189,9 @@ void LightPredictor::DequantizeWeight() { } return result; }; - Tensor tmp_tensor; - for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { - auto* block = cpp_program_desc_.GetBlock(i); + for (size_t i = 0; i < program_desc->BlocksSize(); i++) { + auto* block = program_desc->GetBlock(i); for (size_t k = 0; k < block->OpsSize(); ++k) { auto* op_desc = block->GetOp(k); if (is_weight_quantized_op(op_desc)) { diff --git a/lite/api/light_api.h b/lite/api/light_api.h index e651d1323a5ce6e36546e9437d06a472eb8a5137..97a46b7d28ffc84feb87283eed9786b562a45229 100644 --- a/lite/api/light_api.h +++ b/lite/api/light_api.h @@ -46,6 +46,7 @@ class LITE_API LightPredictor { LightPredictor(const std::string& lite_model_file, bool model_from_memory = false) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(lite_model_file, model_from_memory); } @@ -57,6 +58,7 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kNaiveBuffer) { scope_ = std::make_shared(); + program_desc_ = std::make_shared(); Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory); } @@ -78,6 +80,7 @@ class LITE_API LightPredictor { std::vector GetInputNames(); std::vector GetOutputNames(); void PrepareFeedFetch(); + Scope* scope() { return scope_.get(); } private: void Build(const std::string& lite_model_file, 
@@ -91,14 +94,15 @@ class LITE_API LightPredictor { lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, bool model_from_memory = false); - void BuildRuntimeProgram(const cpp::ProgramDesc& prog); + void BuildRuntimeProgram( + const std::shared_ptr& program_desc); void DequantizeWeight(); private: std::shared_ptr scope_; std::unique_ptr program_; - cpp::ProgramDesc cpp_program_desc_; + std::shared_ptr program_desc_; std::vector input_names_; std::vector output_names_; }; diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 718ba020fb9c6daa4dc4d7263238692267335a48..c9c34377e2a82b72d26e3148a694fe0662e985ce 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) { threads_ = config.threads(); #ifdef LITE_WITH_NPU + // Store the model-level configuration into scope for kernels, and use + // exe_scope to store the execution-level configuration Context::SetSubgraphModelCacheDir( + raw_predictor_->scope(), config.subgraph_model_cache_dir()); +#endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Context::SetHuaweiAscendDeviceID( + config.get_device_id()); + Context::SetSubgraphModelCacheDir( config.subgraph_model_cache_dir()); #endif } diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 465f82056c6bb80b706cfb7d875773d75735911b..b523d5951b3302c5aa46763625af12e24da0015e 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -97,7 +97,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; + double eps = first_target == TARGET(kOpenCL) ? 
0.25 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index c2fb594e8877020848ecc90c039c31d6f77f638b..e6a53e93e72261082fa220c5fe7b0c12bf60ca87 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -112,6 +112,8 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); } else if (target_repr == "mlu") { @@ -201,6 +203,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -265,16 +268,17 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index 4ee18e24a632777c6a3e4a661c90aa9b59654028..ed41a821c0938b599dc8900baa021491df78f329 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { valid_places_.emplace_back(TARGET(kX86)); } else if (target_repr == "npu") { valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "huawei_ascend_npu") { + valid_places_.emplace_back(TARGET(kHuaweiAscendNPU)); } else if (target_repr == "xpu") { valid_places_.emplace_back(TARGET(kXPU)); } else if (target_repr == "rknpu") { @@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() { " `set_model_type(protobuf|naive_buffer)`: naive_buffer by " "default\n" " `set_lite_out(output_optimize_model_dir)`\n" - " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" + " " + "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n" " `record_model_info(false|true)`: refer to whether to record ops " "info for striping lib, false by default`\n" " `run() : start model transformation`\n" @@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + 
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`" " Display operators in the input model\n"; std::cout << "paddlelite opt version:" << opt_version << std::endl << help_info << std::endl; @@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kXPU", "kRKNPU", "kAPU", + "kHuaweiAscendNPU", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 2bcfa9be1f8a601ace71291c7d820bc77d1acde6..08d2233536b90d2b39c7ba6e6733036652179d5f 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -32,9 +32,22 @@ #include "lite/backends/mlu/target_wrapper.h" #endif +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_runtime.h" +#endif + namespace paddle { namespace lite_api { +bool IsOpenCLBackendValid() { + bool opencl_valid = false; +#ifdef LITE_WITH_OPENCL + opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice(); +#endif + LOG(INFO) << "opencl_valid:" << opencl_valid; + return opencl_valid; +} + Tensor::Tensor(void *raw) : raw_tensor_(raw) {} // TODO(Superjomn) refine this by using another `const void* const_raw`; diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 3d87b78c83ef7ef771bfbccc12efc37e7be92e4c..6fe00bbd32d51e7d923901792e9d62166058c406 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -33,6 +33,9 @@ using lod_t = std::vector>; enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; +// return true if current device supports OpenCL model +LITE_API bool IsOpenCLBackendValid(); + struct LITE_API Tensor { explicit Tensor(void* raw); explicit Tensor(const void* raw); @@ -123,6 +126,7 @@ class LITE_API ConfigBase { PowerMode mode_{LITE_POWER_NO_BIND}; // to save subgraph model for npu/xpu/... std::string subgraph_model_cache_dir_{""}; + int device_id_{0}; public: explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); @@ -142,6 +146,9 @@ class LITE_API ConfigBase { const std::string& subgraph_model_cache_dir() const { return subgraph_model_cache_dir_; } + // set Device ID + void set_device_id(int device_id) { device_id_ = device_id; } + const int get_device_id() const { return device_id_; } }; /// CxxConfig is the config for the Full feature predictor. 
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 59603e25f3b7e4942a6be4d7af008c4a9dd6772b..29a119a6916e1e9fe9880c801291072351c18365 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -75,7 +75,8 @@ const std::string& TargetToStr(TargetType target) { "bm", "mlu", "rknpu", - "apu"}; + "apu", + "huawei_ascend_npu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -118,7 +119,8 @@ const std::string& TargetRepr(TargetType target) { "kBM", "kMLU", "kRKNPU", - "kAPU"}; + "kAPU", + "kHuaweiAscendNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -163,7 +165,8 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kMLU), TARGET(kAPU), TARGET(kRKNPU), - TARGET(kFPGA)}); + TARGET(kFPGA), + TARGET(kHuaweiAscendNPU)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index a43e74cd3a13b2e4fecd95428b9fd3fe8579d4d3..5161d6b58af01f7af4dcbaec6a1cacb91e7c7056 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -57,7 +57,8 @@ enum class TargetType : int { kMLU = 11, kRKNPU = 12, kAPU = 13, - NUM = 14, // number of fields. + kHuaweiAscendNPU = 14, + NUM = 15, // number of fields. }; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 2ec4965d3d526c82c41b51954f9564488c5126e1..f132b2064e76a85865b6092240ec96d6af9ae49a 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -28,6 +28,7 @@ USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(remove_tf_redundant_ops_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); +USE_MIR_PASS(lite_conv_conv_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); @@ -47,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); +USE_MIR_PASS(huawei_ascend_npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass) USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index b7b24dfcea31d6e6e78538c6ac33923116b2e5a5..e32b61094a0b9ce9781cb6e9b8aef7ab753d7278 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) { .value("MLU", TargetType::kMLU) .value("RKNPU", TargetType::kRKNPU) .value("APU", TargetType::kAPU) + .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 7f0d53f976ace17ee8d95e62e62d56f5cb974881..27a8a46cfa1413ea0d9ffa3641d8e4bd60785e11 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(mlu) add_subdirectory(bm) add_subdirectory(apu) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/backends/arm/math/beam_search.cc 
b/lite/backends/arm/math/beam_search.cc index 32b7d3bfeba6107493d62a0c9be14a3c15ce7692..74dfa143bda97219874b0e53efc7de34b0416c0e 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids, selected_ids->Resize(dims); selected_scores->Resize(dims); if (parent_idx) { - parent_idx->Resize(dims); + parent_idx->Resize({static_cast(num_instances)}); } auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 9625b1cc03ba007676705e68a738b893024df779..c72223d2e845bc67b541e6f1790e45129deff62f 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din, } return true; } +// for example: m = 4, n = 4 +// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9 , 10 ,11], [12, 13, 14, 15]] +// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]] +/* + m = 8 n = 8: 0 1 2 3 4 5 6 7 0 8 16 24 32 40 48 56 + 16 17 18 19 20 21 22 23 2 10 18 26 34 42 50 58 + 24 25 26 27 28 29 30 31 3 11 19 27 35 43 51 59 + 32 33 34 35 36 37 38 39 4 12 20 28 36 44 52 60 ... + } + } +*/ +template +void local_transpose(const Dtype* din, Dtype* dout, int m, int n) { + // n % 4 == 0 && m % 4 == 0 + // n * m ==> n * m data trans + int offset_m = m << 2; + const Dtype* din_ptr = din; + Dtype* dout_ptr = dout; + for (int i = 0; i < n; i += 4) { + Dtype* out_ptr0 = dout_ptr; + Dtype* out_ptr1 = dout_ptr + m; + Dtype* out_ptr2 = out_ptr1 + m; + Dtype* out_ptr3 = out_ptr2 + m; + const Dtype* in_ptr0 = din_ptr; + const Dtype* in_ptr1 = din_ptr + m; + const Dtype* in_ptr2 = in_ptr1 + m; + const Dtype* in_ptr3 = in_ptr2 + m; + for (int j = 0; j < m; j += 4) { + float32x4_t vin0 = vld1q_f32(in_ptr0); + float32x4_t vin1 = vld1q_f32(in_ptr1); + float32x4_t vin2 = vld1q_f32(in_ptr2); + float32x4_t vin3 = vld1q_f32(in_ptr3); + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3); + in_ptr0 = in_ptr3 + m; + in_ptr1 = in_ptr3 + 2 * m; + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + in_ptr2 = in_ptr1 + m; + in_ptr3 = in_ptr1 + 2 * m; + vst1q_f32(out_ptr0, tmp0.val[0]); + vst1q_f32(out_ptr1, tmp0.val[1]); + out_ptr0 += 4; + out_ptr1 += 4; + vst1q_f32(out_ptr2, tmp2.val[0]); + vst1q_f32(out_ptr3, tmp2.val[1]); + out_ptr2 += 4; + out_ptr3 += 4; + } + dout_ptr += offset_m; + din_ptr += 4; + } +} +template +void transpose(const Dtype* din, Dtype* dout, int m, int n) { + // nxm == mxn + // 4x4 + int cnt_n = n >> 2; + int remain_n = n & 3; + int cnt_m = m >> 2; + int remain_m = m & 3; + int nn_num = n << 2; // n * 4 + int mm_num = m << 2; // m * 4 + for (int x = 0; x < cnt_n; x++) { + const Dtype* din_ptr0 = din + x * mm_num; + const Dtype* din_ptr1 = din_ptr0 + m; + const Dtype* din_ptr2 = din_ptr1 + m; + const Dtype* din_ptr3 = din_ptr2 + m; + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t 
din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03 + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + // a00 b00 a02 b02 a01 b01 a03 b03 + float32x4x2_t tmp0 = vtrnq_f32(din0, din1); + // c00 d00 c02 d02 c01 d01 c03 d03 + float32x4x2_t tmp2 = vtrnq_f32(din2, din3); + din_ptr0 += 4; + din_ptr1 += 4; + // a00 b00 c00 d00 a02 b02 c02 d02 + // a01 b01 c01 d01 a03 b03 c03 d03 + float tmp_val1 = tmp0.val[0][2]; + float tmp_val2 = tmp0.val[0][3]; + tmp0.val[0][2] = tmp2.val[0][0]; + tmp0.val[0][3] = tmp2.val[0][1]; + float tmp_val3 = tmp0.val[1][2]; + float tmp_val4 = tmp0.val[1][3]; + tmp2.val[0][0] = tmp_val1; + tmp2.val[0][1] = tmp_val2; + tmp0.val[1][2] = tmp2.val[1][0]; + tmp0.val[1][3] = tmp2.val[1][1]; + tmp2.val[1][0] = tmp_val3; + tmp2.val[1][1] = tmp_val4; + din_ptr2 += 4; + din_ptr3 += 4; + vst1q_f32(dout_ptr0, tmp0.val[0]); + vst1q_f32(dout_ptr1, tmp0.val[1]); + dout_ptr0 += nn_num; + vst1q_f32(dout_ptr2, tmp2.val[0]); + vst1q_f32(dout_ptr3, tmp2.val[1]); + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + *dout_ptr0++ = *din_ptr1++; + *dout_ptr0++ = *din_ptr2++; + *dout_ptr0++ = *din_ptr3++; + } + } + const Dtype* din_ptr0 = din + cnt_n * mm_num; + dout = dout + cnt_n * 4; + for (int x = 0; x < remain_n; x++) { + Dtype* dout_ptr0 = dout + x * 4; + for (int y = 0; y < cnt_m; y++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + Dtype* dout_ptr1 = dout_ptr0 + n; + Dtype* dout_ptr2 = dout_ptr1 + n; + Dtype* dout_ptr3 = dout_ptr2 + n; + din_ptr0 += 4; + *dout_ptr0 = din0[0]; + *dout_ptr1 = din0[1]; + dout_ptr0 += nn_num; + *dout_ptr2 = din0[2]; + *dout_ptr3 = din0[3]; + } + for (int y = 0; y < remain_m; y++) { + *dout_ptr0++ = *din_ptr0++; + } + } +} /*preprocessing inputs * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws] * n = he - hs diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 04373992e4802a0b0c2529daac851e00ebcb56cf..a73a63ddcb67f8790f73aff3fff8368f4005b7e1 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -747,6 +747,16 @@ void elementwise_mul(const int* dinx, } } +template <> +void elementwise_mul(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num) { + for (int i = 0; i < num; i++) { + dout[i] = dinx[i] * diny[i]; + } +} + template <> void elementwise_mul_relu(const float* dinx, const float* diny, @@ -801,6 +811,17 @@ void elementwise_mul_relu(const float* dinx, } } +template <> +void elementwise_mul_relu(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num) { + for (int i = 0; i < num; i++) { + int64_t tmp = dinx[i] * diny[i]; + dout[i] = tmp > 0 ? 
tmp : 0; + } +} + template <> void elementwise_mul_broadcast(const float* dinx, const float* diny, @@ -935,6 +956,29 @@ void elementwise_mul_broadcast(const int* dinx, } } +template <> +void elementwise_mul_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num) { +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const int64_t* dinx_ptr = dinx + offset; + const int64_t diny_data = diny[j]; + int64_t* dout_ptr = dout + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dinx_ptr * diny_data; + dout_ptr++; + dinx_ptr++; + } + } + } +} + template <> void elementwise_mul_relu_broadcast(const float* dinx, const float* diny, @@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast(const float* dinx, } } +template <> +void elementwise_mul_relu_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num) { +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const int64_t* dinx_ptr = dinx + offset; + const int64_t diny_data = diny[j]; + int64_t* dout_ptr = dout + offset; + for (int k = 0; k < num; ++k) { + int64_t tmp = *dinx_ptr * diny_data; + *dout_ptr = tmp > 0 ? tmp : 0; + dout_ptr++; + dinx_ptr++; + } + } + } +} + template <> void elementwise_max(const float* dinx, const float* diny, diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 3e6cbff0660be8f2542d059a39115bed52122ff1..8303851ece9dd2f1d053f9f4b888e42f2fdc0aad 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din, } else { if (pad_bottom > 1) { coef_h = 1.f / 3; - } else if (pad_bottom = 1) { + } else if (pad_bottom == 1) { coef_h = 0.5f; } else { coef_h = 1.f; diff --git a/lite/backends/arm/math/prior_box.cc b/lite/backends/arm/math/prior_box.cc index 6daab69ebf00da24d67132afba4b9abef0afbd39..4ef7356e67cee4c47ddf3eb16ed5286b4271b41a 100644 --- a/lite/backends/arm/math/prior_box.cc +++ b/lite/backends/arm/math/prior_box.cc @@ -21,7 +21,7 @@ namespace lite { namespace arm { namespace math { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc index b8f9ab0a1a842a59971ad4c165d4c1be3426059a..ded76c1bdae354ca46a254309dcc6b3e216c92f4 100644 --- a/lite/backends/arm/math/sequence_pool.cc +++ b/lite/backends/arm/math/sequence_pool.cc @@ -46,11 +46,60 @@ void seq_pool_sum(const float* din, memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; height = height - 1; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; + int cnt_w = width >> 2; + int remain_w = width & 3; + int cnt_h = height >> 2; + int remain_h = height & 3; + int stride = width << 2; + for (int w = 0; w < cnt_w; w++) { + const float* din_ptr0 = din_ptr + w * 4; + float32x4_t dout_val = vld1q_f32(dout_ptr); + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = 
vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + dout_val = vaddq_f32(din0, dout_val); + float32x4_t tmp = vaddq_f32(din1, din2); + din_ptr0 += stride; + din_ptr1 += stride; + dout_val = vaddq_f32(din3, dout_val); + din_ptr2 += stride; + din_ptr3 += stride; + dout_val = vaddq_f32(tmp, dout_val); } - din_ptr += width; + for (int h = 0; h < remain_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + dout_val = vaddq_f32(din0, dout_val); + din_ptr0 += width; + } + vst1q_f32(dout_ptr, dout_val); + dout_ptr += 4; + } + const float* din_ptr00 = din_ptr + cnt_w * 4; + for (int w = 0; w < remain_w; w++) { + const float* din_ptr0 = din_ptr00 + w; + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + *dout_ptr += din_ptr0[0]; + float tmp = din_ptr1[0] + din_ptr2[0]; + din_ptr0 += stride; + din_ptr1 += stride; + *dout_ptr += din_ptr3[0]; + din_ptr2 += stride; + din_ptr3 += stride; + *dout_ptr += tmp; + } + for (int h = 0; h < remain_h; h++) { + *dout_ptr += din_ptr0[0]; + din_ptr0 += width; + } + dout_ptr++; } } } @@ -144,12 +193,62 @@ void seq_pool_max(const float* din, } else { memcpy(dout_ptr, din_ptr, width * sizeof(float)); din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); + height = height - 1; + int cnt_w = width >> 2; + int remain_w = width & 3; + int cnt_h = height >> 2; + int remain_h = height & 3; + int stride = width << 2; + for (int w = 0; w < cnt_w; w++) { + const float* din_ptr0 = din_ptr + w * 4; + float32x4_t dout_val = vld1q_f32(dout_ptr); + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + float32x4_t din1 = vld1q_f32(din_ptr1); + float32x4_t din2 = vld1q_f32(din_ptr2); + float32x4_t din3 = vld1q_f32(din_ptr3); + dout_val = vmaxq_f32(din0, dout_val); + float32x4_t tmp = vmaxq_f32(din1, din2); + din_ptr0 += stride; + din_ptr1 += stride; + dout_val = vmaxq_f32(din3, dout_val); + din_ptr2 += stride; + din_ptr3 += stride; + dout_val = vmaxq_f32(tmp, dout_val); } - din_ptr += width; + for (int h = 0; h < remain_h; h++) { + float32x4_t din0 = vld1q_f32(din_ptr0); + dout_val = vmaxq_f32(din0, dout_val); + din_ptr0 += width; + } + vst1q_f32(dout_ptr, dout_val); + dout_ptr += 4; + } + const float* din_ptr00 = din_ptr + cnt_w * 4; + for (int w = 0; w < remain_w; w++) { + const float* din_ptr0 = din_ptr00 + w; + const float* din_ptr1 = din_ptr0 + width; + const float* din_ptr2 = din_ptr1 + width; + const float* din_ptr3 = din_ptr2 + width; + for (int h = 0; h < cnt_h; h++) { + *dout_ptr += din_ptr0[0]; + *dout_ptr = std::max(*dout_ptr, din_ptr0[0]); + float tmp = std::max(din_ptr1[0], din_ptr2[0]); + din_ptr0 += stride; + din_ptr1 += stride; + *dout_ptr = std::max(*dout_ptr, din_ptr3[0]); + din_ptr2 += stride; + din_ptr3 += stride; + *dout_ptr = std::max(*dout_ptr, tmp); + } + for (int h = 0; h < remain_h; h++) { + *dout_ptr = std::max(*dout_ptr, din_ptr0[0]); + din_ptr0 += width; + } + dout_ptr++; } } } diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 7f96308a5dcaf5742bd5dcef7c2e5f146cdb7c59..c23d3d0ed0351b59d4a373efb2474e9a73763659 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ 
-11,10 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps}) +nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps}) +nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps}) set ( math_cuda @@ -25,10 +28,13 @@ set ( cuda_transpose cuda_elementwise cudnn_pool + cuda_gru_forward + cuda_sequence2batch cuda_gemm cuda_batched_gemm cuda_strided_gemm cuda_sequence_padding + cuda_bias ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/activation.cu b/lite/backends/cuda/math/activation.cu index a45e3eb378eefdbabce0b837891514dc659e0429..4d97042aeb0b728b491fbc2dd12ddcc94b4c1490 100644 --- a/lite/backends/cuda/math/activation.cu +++ b/lite/backends/cuda/math/activation.cu @@ -13,6 +13,7 @@ // limitations under the License. #include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/math/activation.h" #include "lite/backends/cuda/math/utils.h" @@ -21,6 +22,20 @@ namespace lite { namespace cuda { namespace math { +ActivationType GetActiveType(const std::string& act) { + if (act == "sigmoid") { + return kSigmoid; + } else if (act == "relu") { + return kReLU; + } else if (act == "tanh") { + return kTanh; + } else if (act == "identify") { + return kIdentity; + } else { + LOG(FATAL) << "not supported activation: " << act; + } +} + template __global__ void relu_kernel(const int num, const float alpha, @@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t); template void bias_relu( int, const float*, const float* bias, float*, float, cudaStream_t); +// ------------- sigmoid ------------- + +template +__global__ void sigmoid_kernel(const int num, const T* in, T* out) { + CUDA_KERNEL_LOOP(i, num) { +#if __CUDA_ARCH__ >= 350 + out[i] = static_cast(1.0f) / + (static_cast(1.0f) + expf(-1 * __ldg(in + i))); +#else + out[i] = static_cast(1.0f) / (static_cast(1.0f) + expf(-in[i])); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half* in, half* out) { + CUDA_KERNEL_LOOP(i, num) { + half tmp = __float2half(1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __hdiv( + tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i))))); +#else + out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i])))); +#endif + } +} + +template <> +__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) { + CUDA_KERNEL_LOOP(i, num) { + half2 tmp = __floats2half2_rn(1.0f, 1.0f); +#if __CUDA_ARCH__ >= 530 + out[i] = __h2div(tmp, + __hadd2(tmp, + h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f), + __ldg(in + i))))); +#else + out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x)))); + out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y)))); +#endif + } +} + +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) { + sigmoid_kernel<<>>( + num, din, 
dout); + CUDA_POST_KERNEL_CHECK; +} + +template <> +void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) { + if (num % 2 == 0) { + const half2* din2 = reinterpret_cast(din); + half2* dout2 = reinterpret_cast(dout); + sigmoid_kernel< + half2><<>>( + num / 2, din2, dout2); + } else { + sigmoid_kernel<<>>( + num, din, dout); + } + CUDA_POST_KERNEL_CHECK; +} + +template void sigmoid(const int num, + const float* din, + float* dout, + cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git a/lite/backends/cuda/math/activation.h b/lite/backends/cuda/math/activation.h index 887a222ee83878aa19fd6a94a76572e48ab4d954..926ad8d99fc4bd6464ed517505fcf30f035c57f8 100644 --- a/lite/backends/cuda/math/activation.h +++ b/lite/backends/cuda/math/activation.h @@ -17,11 +17,22 @@ #include #include +#include "lite/utils/cp_logging.h" + namespace paddle { namespace lite { namespace cuda { namespace math { +enum ActivationType { + kSigmoid, + kReLU, + kTanh, + kIdentity, +}; + +ActivationType GetActiveType(const std::string& act); + // fp32 and half template void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); @@ -72,6 +83,9 @@ void bias_int8_nhwc(int num, const void* scale, cudaStream_t stream); +template +void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream); + } // namespace math } // namespace cuda } // namespace lite diff --git a/lite/backends/cuda/math/bias.cu b/lite/backends/cuda/math/bias.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e597e51c81cf75ddc2f850ac41924a0176ecb45 --- /dev/null +++ b/lite/backends/cuda/math/bias.cu @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
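The fp16 path above reinterprets the input as half2 so each thread handles a pair of values when num is even, and falls back to the scalar half kernel otherwise. A host-side scalar reference of the same math, useful for checking both paths in a unit test (a sketch, not part of the patch):

#include <cmath>
#include <vector>

// Reference sigmoid: out[i] = 1 / (1 + exp(-in[i])), the formula the CUDA
// kernels above implement for float, half and half2.
std::vector<float> SigmoidRef(const std::vector<float>& in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = 1.0f / (1.0f + std::exp(-in[i]));
  }
  return out;
}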
+#include "lite/backends/cuda/math/bias.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void RowwiseAddKernel( + const T* a, const T* b, T* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template <> +__global__ void RowwiseAddKernel( + const half* a, const half* b, half* c, int width, int num) { + CUDA_KERNEL_LOOP(i, num) { + int h = i / width; + int w = i - h * width; + c[i] = __hadd(a[i], b[w]); + } +} + +template +void RowwiseAdd::operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream) { + RowwiseAddKernel<<>>( + input, bias, output, width, count); + CUDA_POST_KERNEL_CHECK; +} + +template struct RowwiseAdd; +template struct RowwiseAdd; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/bias.h b/lite/backends/cuda/math/bias.h new file mode 100644 index 0000000000000000000000000000000000000000..98f805a013ff80b267301be4d47a9694c5ce642f --- /dev/null +++ b/lite/backends/cuda/math/bias.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +struct RowwiseAdd { + void operator()(const T* input, + const T* bias, + T* output, + const int width, + const int count, + const cudaStream_t& stream); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gru_forward.cu b/lite/backends/cuda/math/gru_forward.cu new file mode 100644 index 0000000000000000000000000000000000000000..cd04c3871db07a18acab99c960a90124941ade5d --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +/* + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + T prev_out = 0; + T reset_out_val; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == lite::cuda::math::ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == lite::cuda::math::ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } + + reset_out_val = prev_out * reset_gate_value; + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +template <> +__global__ void GruForwardResetOutput( + half* gate_value, + half* reset_output_value, + half* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; + } + half prev_out = 0; + half reset_out_val; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_output_value) { + if (is_batch) { + prev_output_value += batch_idx * frame_size; + } + prev_out = prev_output_value[frame_idx]; + } + + if (active_gate == ActivationType::kSigmoid) { + update_gate_value = Sigmoid(update_gate_value); + reset_gate_value = Sigmoid(reset_gate_value); + } else if (active_gate == ActivationType::kReLU) { + update_gate_value = ReLU(update_gate_value); + reset_gate_value = ReLU(reset_gate_value); + } else if (active_gate == ActivationType::kTanh) { + update_gate_value = Tanh(update_gate_value); + reset_gate_value = Tanh(reset_gate_value); + } +#if __CUDA_ARCH__ >= 530 + reset_out_val = __hmul(prev_out, reset_gate_value); +#else + reset_out_val = + __float2half(__half2float(prev_out) * __half2float(reset_gate_value)); +#endif + + gate_value[frame_idx + frame_size * 0] = update_gate_value; + gate_value[frame_idx + frame_size * 1] = reset_gate_value; + reset_output_value[frame_idx] = reset_out_val; +} + +/* + * 
threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) + */ +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + T output; + T prev_out = 0; + T update_gate_value = gate_value[frame_idx + frame_size * 0]; + T state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { + output = update_gate_value * prev_out + state_frame_value - + update_gate_value * state_frame_value; + } else { + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template <> +__global__ void GruForwardFinalOutput( + half* gate_value, + half* prev_output_value, + half* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch) { + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) { + return; + } + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; + } + + half output; + half prev_out = 0; + half update_gate_value = gate_value[frame_idx + frame_size * 0]; + half state_frame_value = gate_value[frame_idx + frame_size * 2]; + + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + prev_out = prev_output_value[frame_idx]; + } + + if (active_node == lite::cuda::math::ActivationType::kSigmoid) { + state_frame_value = Sigmoid(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kReLU) { + state_frame_value = ReLU(state_frame_value); + } else if (active_node == lite::cuda::math::ActivationType::kTanh) { + state_frame_value = Tanh(state_frame_value); + } + + if (origin_mode) { +#if __CUDA_ARCH__ >= 530 + output = + __hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value), + __hmul(update_gate_value, state_frame_value)); +#else + output = __float2half( + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(state_frame_value) - + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } else { +#if __CUDA_ARCH__ >= 530 + output = prev_out - update_gate_value * prev_out + + update_gate_value * state_frame_value; + output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)), + __hmul(update_gate_value, state_frame_value)); +#else + output = 
__float2half( + __half2float(prev_out) - + __half2float(update_gate_value) * __half2float(prev_out) + + __half2float(update_gate_value) * __half2float(state_frame_value)); +#endif + } + + gate_value[frame_idx + frame_size * 2] = state_frame_value; + output_value[frame_idx] = output; +} + +template __global__ void GruForwardFinalOutput( + float* gate_value, + float* prev_output_value, + float* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +template __global__ void GruForwardResetOutput( + float* gate_value, + float* reset_output_value, + float* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gru_forward.h b/lite/backends/cuda/math/gru_forward.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1648c437e860bec07fbec7bbbd69b659a58407 --- /dev/null +++ b/lite/backends/cuda/math/gru_forward.h @@ -0,0 +1,242 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +template +inline __device__ Dtype Sigmoid(const Dtype a) { + const Dtype min = SIGMOID_THRESHOLD_MIN; + const Dtype max = SIGMOID_THRESHOLD_MAX; + Dtype tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + expf(-tmp)); +} + +template <> +inline __device__ half Sigmoid(const half a) { +#if __CUDA_ARCH__ >= 530 + const half tmp = __float2half(1.0f); + return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a)))); +#else + return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f)); +#endif +} + +template +inline __device__ Dtype ReLU(const Dtype a) { + return a > static_cast(0.f) ? a : static_cast(0.f); +} + +template <> +inline __device__ half ReLU(const half a) { + const half tmp = __float2half(0.f); +#if __CUDA_ARCH__ >= 530 + return __hgt(a, tmp) ? a : tmp; +#else + return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f); +#endif +} + +template +inline __device__ Dtype Tanh(const Dtype a) { + Dtype tmp = static_cast(-2.0) * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + return (static_cast(2.0) / (static_cast(1.0) + expf(tmp))) - + static_cast(1.0); +} + +template <> +inline __device__ half Tanh(const half a) { +#if __CUDA_ARCH__ >= 530 + half tmp = __float2half(1.0f); + half numerator = __hmul(__float2half(-2.0f), a); + return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp); +#else + float tmp = -2.0f * __half2float(a); + return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f); +#endif +} + +template +__global__ void GruForwardResetOutput( + T* gate_value, + T* reset_output_value, + T* prev_output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_gate, + bool is_batch); + +template +__global__ void GruForwardFinalOutput( + T* gate_value, + T* prev_output_value, + T* output_value, + int frame_size, + int batch_size, + lite::cuda::math::ActivationType active_node, + bool origin_mode, + bool is_batch); + +/* + * threads(tile_size, 1) + * grids(frame_blocks, 1) + */ +template +__global__ void FastCollectiveGruGate(T* gate_value, + T* prev_output_value, + T* gate_weight, + T* reset_output, + int frame_size, + ActivationType active_node) { + T xt_0 = 0.0f; + T a0 = 0.0f; + T c0 = 0.0f; + T b0[TiledSize]; + + int col = blockIdx.x * blockDim.x + threadIdx.x; + int tiled_mask = ((1 << TiledSize) - 1); + // tiled matrix multiply using register shift, faster than sm. + if (prev_output_value) { + for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = prev_output_value[threadIdx.x + (k * TiledSize)]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col]; + } + } + + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; +#endif + } + } + } + + __syncthreads(); + + if (col < frame_size * 2) { + xt_0 = gate_value[col]; + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col] = c0; + if (frame_size <= col && col < frame_size * 2) { + T htp_0 = 0.0; + if (prev_output_value) { + htp_0 = prev_output_value[col - frame_size]; + } + reset_output[col - frame_size] = c0 * htp_0; + } else if (col < frame_size) { + gate_value[col] = c0; + } + } +} + +template +__global__ void FastCollectiveGruOut(T* gate_weight, + T* prev_out_value, + T* output_value, + T* gate_value, + T* reset_value, + int frame_size, + ActivationType active_node, + bool origin_mode) { + int col = blockIdx.x * blockDim.x + threadIdx.x; + T a0 = 0.0f; + T b0[TiledSize]; + T c0 = 0.0f; + + int tiled_mask = ((1 << TiledSize) - 1); + if (prev_out_value) { + for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) { + a0 = 0; + if ((threadIdx.x + k * TiledSize) < frame_size) { + a0 = reset_value[threadIdx.x + k * TiledSize]; + } + for (int i = 0; i < TiledSize; ++i) { + if (col < frame_size && (i + k * TiledSize) < frame_size) { + b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col]; + } + } + for (int i = 0; i < TiledSize; ++i) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i]; +#else + c0 = c0 + __shfl(a0, i, TiledSize) * b0[i]; 
+#endif + } + } + } + + __syncthreads(); + + if (col < frame_size) { + T xt_0 = gate_value[col + 2 * frame_size]; + T gta_0 = gate_value[col]; + T htp_0 = 0; + if (prev_out_value) { + htp_0 = prev_out_value[col]; + } + c0 += xt_0; + if (active_node == ActivationType::kSigmoid) { + c0 = Sigmoid(c0); + } else if (active_node == ActivationType::kReLU) { + c0 = ReLU(c0); + } else if (active_node == ActivationType::kTanh) { + c0 = Tanh(c0); + } + gate_value[col + 2 * frame_size] = c0; + if (origin_mode) { + output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0; + } else { + output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0; + } + } +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/scale.cu b/lite/backends/cuda/math/scale.cu index 806a3697a2eb19354a81056f0a7ab6272ed991a1..f9d5209c3e4af11231f4b62531f9eb11ede56557 100644 --- a/lite/backends/cuda/math/scale.cu +++ b/lite/backends/cuda/math/scale.cu @@ -22,10 +22,6 @@ namespace lite { namespace cuda { namespace math { -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) - template __global__ void scale_kernel(int count, const T* in_data, @@ -48,7 +44,6 @@ __global__ void scale_kernel(int count, template __global__ void scale_kernel( int count, const T* in_data, T* out_data, const T scale, const T bias) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; } } @@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num, } template -void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) { +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } template @@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) { int thread = 256; int block = (num + thread - 1) / thread; scale_kernel<<>>(num, in, out, scale, bias); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } -template void scale(int num, const float*, float*, float, cudaStream_t, float); +template void scale(int num, const float*, float*, float, float, cudaStream_t); template void scale(int num, const float*, float*, float, float); } // namespace math diff --git a/lite/backends/cuda/math/scale.h b/lite/backends/cuda/math/scale.h index 52ed1d38ae79ce11cac50a9abef0f57e6de1352c..b9961b12c3c251ffb7f80589fa8c9ccb12d96e30 100644 --- a/lite/backends/cuda/math/scale.h +++ b/lite/backends/cuda/math/scale.h @@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num, cudaStream_t stream); template -void scale( - int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0); +void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream); template void scale(int num, const T* in, T* out, T scale, T bias = 0); diff --git a/lite/backends/cuda/math/sequence2batch.cu b/lite/backends/cuda/math/sequence2batch.cu new file mode 100644 index 0000000000000000000000000000000000000000..9a93362b3bb163b889049d07186634987ed63940 --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.cu @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +__global__ void CopyMatrixRowsKernel(const T* src, + T* dst, + const uint64_t* index, + int height, + int width, + bool is_src_index) { + int idx = threadIdx.x; + int idy = threadIdx.y; + int row_id = blockDim.y * blockIdx.x + idy; + if (row_id < height) { + int src_idx = is_src_index ? index[row_id] : row_id; + int dst_idx = is_src_index ? row_id : index[row_id]; + const T* src_data = src + src_idx * width; + T* dst_data = dst + dst_idx * width; + for (int i = idx; i < width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } +} + +template +void CopyMatrixRowsFunctor::operator()( + const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream) { + auto src_dims = src.dims(); + auto dst_dims = dst->dims(); + CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; + int height = dst_dims[0]; + int width = dst_dims[1]; + const auto* src_data = src.data(); + auto* dst_data = dst->template mutable_data(TARGET(kCUDA)); + + index_tensor_.Resize({static_cast(index_lod.size())}); + auto* index_tensor_data = index_tensor_.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(index_tensor_data, + index_lod.data(), + sizeof(uint64_t) * index_lod.size(), + IoDirection::HtoD, + stream); + dim3 threads(128, 8); + dim3 grids((height + threads.y - 1) / threads.y); + CopyMatrixRowsKernel<<>>( + src_data, dst_data, index_tensor_data, height, width, is_src_index); + CUDA_POST_KERNEL_CHECK; +} + +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; + +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence2batch.h b/lite/backends/cuda/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..e5a12ed0b4d54a9af47cfc046906ae96767e63cf --- /dev/null +++ b/lite/backends/cuda/math/sequence2batch.h @@ -0,0 +1,167 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + // If is_src_index is true, copy the indexed rows of input src to the output + // dst. If is_src_index is false, copy the input src to the indexed of output + // dst. The indexes rows are based on the input index. + void operator()(const lite::Tensor& src, + lite::Tensor* dst, + const std::vector& index_lod, + bool is_src_index, + const cudaStream_t& stream); + + private: + lite::Tensor index_tensor_; +}; + +template +class LoDTensor2BatchFunctor { + // Calculate the length of each sequence and + // sort sequence index by the length. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} + struct SeqInfo { + SeqInfo(size_t start_val, size_t len_val, size_t seq_val) + : start(start_val), length(len_val), seq_idx(seq_val) {} + size_t start; + size_t length; + size_t seq_idx; + }; + + public: + void operator()(const lite::Tensor& lod_tensor, + lite::Tensor* batch_tensor, + bool is_reverse, + const cudaStream_t& stream) const { + auto lods = lod_tensor.lod(); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; + const auto& lod = lods[0]; + + std::vector seq_info; + for (int seq_id = 0; seq_id < static_cast(lod.size()) - 1; ++seq_id) { + size_t length = lod[seq_id + 1] - lod[seq_id]; + seq_info.emplace_back(lod[seq_id], length, seq_id); + } + + std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { + return a.length > b.length; + }); + + // Calculate the start position of each batch. + // example: sequences = {s0, s1, s2} + // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 + // max_seqlen = 5, + // batchIndex = {b0, b1, b2, b3, b4} + // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 + // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} + // batch_start_positions[0] = 0 + // batch_start_positions[1] = len(b0) + // batch_start_positions[2] = len(b0) + len(b1) + // ... + // seq2batch_idx[12] = {4, 0, 9, + // 5, 1, 10, + // 6, 2, 11, + // 7, 3, + // 8} + // seq_order = {1, 0, 2}, the sort order. + // where 1 is the second sequence, + // 0 is the first sequence, + // 2 is the third sequence. + + LoD batch_lods; + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + batch_lods.emplace_back(std::vector{0}); + + // batch_lods[0] is the start positions for batch LoDTensor + size_t max_seqlen = seq_info[0].length; + batch_lods[0].resize(max_seqlen + 1); + // batch_lods[1] is the raw index in the input LoDTensor + batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); + // batch_lods[2] is the sort order for the input LoDTensor. 
+ batch_lods[2].resize(seq_info.size()); + + auto* batch_starts = batch_lods[0].data(); + auto* seq2batch_idx = batch_lods[1].data(); + batch_starts[0] = 0; + for (size_t n = 0; n < max_seqlen; ++n) { + size_t batch_id = batch_starts[n]; + for (size_t i = 0; i < seq_info.size(); ++i) { + size_t seq_len = seq_info[i].length; + size_t start = seq_info[i].start; + if (n < seq_len) { + seq2batch_idx[batch_id] = + is_reverse ? start + seq_len - 1 - n : start + n; + ++batch_id; + } else { + break; + } + } + batch_starts[n + 1] = batch_id; + } + auto* seq_order = batch_lods[2].data(); + for (size_t i = 0; i < seq_info.size(); ++i) { + seq_order[i] = seq_info[i].seq_idx; + } + + batch_tensor->set_lod(batch_lods); + + lite::cuda::math::CopyMatrixRowsFunctor to_batch; + to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +class Batch2LoDTensorFunctor { + public: + void operator()(const lite::Tensor& batch_tensor, + lite::Tensor* lod_tensor, + const cudaStream_t& stream) { + auto in_lod = batch_tensor.lod(); + CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at " + "least 2-level sequence infomation."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; + lite::cuda::math::CopyMatrixRowsFunctor to_seq; + to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream); + CUDA_POST_KERNEL_CHECK; + } +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.cu b/lite/backends/cuda/math/sequence_padding.cu index 3a32be2a3446e420cac53a33506f141a001d61f0..e4f194b9c2289c51983d62b3835727efea91028d 100644 --- a/lite/backends/cuda/math/sequence_padding.cu +++ b/lite/backends/cuda/math/sequence_padding.cu @@ -86,8 +86,7 @@ void SequencePadding(T* pad_data, seq_num, pad_seq_len, step_width); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } template @@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data, seq_num, pad_seq_len, step_width); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); + CUDA_POST_KERNEL_CHECK; } template void SequencePadding(float* pad_data, diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 3eeee84c1c46a65782e38b998bcd8142e08cbec1..caa9b3077fe96bf73e50b33688b90b71e0cd5c23 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "lite/backends/cuda/cuda_utils.h" #include "lite/core/target_wrapper.h" namespace paddle { @@ -31,6 +32,16 @@ class TargetWrapper { static size_t num_devices(); static size_t maximum_stream() { return 0; } + static int GetComputeCapability() { + int dev_id = GetCurDevice(); + int major, minor; + CUDA_CALL(cudaDeviceGetAttribute( + &major, cudaDevAttrComputeCapabilityMajor, dev_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &minor, cudaDevAttrComputeCapabilityMinor, dev_id)); + return major * 10 + minor; + } + static size_t GetCurDevice() { int dev_id; cudaGetDevice(&dev_id); diff --git a/lite/backends/host/target_wrapper.cc b/lite/backends/host/target_wrapper.cc index 5f020662a9d74aab6c28f79221d670e5de5ae048..00ce9dd6b349decc2f603692c2a6a0801bd4d7c0 100644 --- a/lite/backends/host/target_wrapper.cc +++ b/lite/backends/host/target_wrapper.cc @@ -19,7 +19,7 
@@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* TargetWrapper::Malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; @@ -30,7 +30,6 @@ void* TargetWrapper::Malloc(size_t size) { void* r = reinterpret_cast(reinterpret_cast(p + offset) & (~(MALLOC_ALIGN - 1))); static_cast(r)[-1] = p; - memset(r, 0, size); return r; } void TargetWrapper::Free(void* ptr) { diff --git a/lite/backends/huawei_ascend_npu/CMakeLists.txt b/lite/backends/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..65616b4d357d4d29ca9b356abead2e1f6eb725d1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs}) +lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu) diff --git a/lite/backends/huawei_ascend_npu/device.cc b/lite/backends/huawei_ascend_npu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..c8dc3d1de46fe12c3cb41257f864bcb1ff82bd9a --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
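The host allocator change above lowers MALLOC_ALIGN from 64 to 16 and drops the memset, so callers can no longer assume freshly allocated host buffers are zero-filled. The alignment trick itself is unchanged; a standalone sketch of the same pattern (illustrative, not part of the patch):

#include <cstdint>
#include <cstdlib>

static const int kAlign = 16;  // mirrors the new MALLOC_ALIGN value

void* AlignedMalloc(size_t size) {
  // Over-allocate so there is room to round up to kAlign and to stash the
  // raw pointer one slot before the aligned pointer handed to the caller.
  size_t offset = sizeof(void*) + kAlign - 1;
  char* p = static_cast<char*>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  void* r = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(p + offset) &
                                    ~(static_cast<uintptr_t>(kAlign) - 1));
  static_cast<void**>(r)[-1] = p;  // remembered so AlignedFree can release it
  return r;
}

void AlignedFree(void* ptr) {
  if (ptr != nullptr) free(static_cast<void**>(ptr)[-1]);
}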
+ +#include "lite/backends/huawei_ascend_npu/device.h" +#include +#include +#include "ge/ge_api_types.h" +#include "ge/ge_ir_build.h" +#include "graph/graph.h" +#include "lite/utils/io.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +std::shared_ptr Device::LoadFromMem( + const std::vector& model_buffer, const int device_id) { + if (model_buffer.size() == 0) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!"; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromMem( + reinterpret_cast(model_buffer.data()), + model_buffer.size())) { + return model_client; + } + return nullptr; +} + +std::shared_ptr Device::LoadFromFile( + const std::string& model_path, const int device_id) { + if (!paddle::lite::IsFileExists(model_path)) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path; + return nullptr; + } + + // Create a ACL model client to load the om model + std::shared_ptr model_client(new AclModelClient(device_id)); + // Load model from memory + if (model_client->LoadFromFile(model_path.c_str())) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + return model_client; + } + return nullptr; +} + +std::mutex Device::device_mutex_; + +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + std::lock_guard lock(device_mutex_); + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + + // Build IR model + ge::ModelBufferData om_buffer; + std::map options; + options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error")); + + ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer)); + + // Copy from om model buffer + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data.get()), + om_buffer.length); + + return true; +} + +void Device::InitOnce() { + if (runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!"; + return; + } + // ACL runtime init => can only be called once in one process + ACL_CALL(aclInit(NULL)); + + // ATC builder init => can only be called once in one process + std::map global_options; + global_options.insert( + std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310")); + ATC_CALL(ge::aclgrphBuildInitialize(global_options)); + + runtime_inited_ = true; +} + +void Device::DestroyOnce() { + if (!runtime_inited_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!"; + return; + } + // ATC builder finalize => can only be called once in one process + ge::aclgrphBuildFinalize(); + // ACL runtime finalize => can only be called once in one process + ACL_CALL(aclFinalize()); + + runtime_inited_ = false; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/device.h b/lite/backends/huawei_ascend_npu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..de7ca55670ad019b0f035f9e8ab42c29748654f1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/device.h @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() { InitOnce(); } + + ~Device() { DestroyOnce(); } + + std::shared_ptr LoadFromMem( + const std::vector& model_buffer, const int device_id); + std::shared_ptr LoadFromFile(const std::string& model_path, + const int device_id); + // Build the ACL IR graph to the ACL om model + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); // NOLINT + + private: + void InitOnce(); + void DestroyOnce(); + bool runtime_inited_{false}; + static std::mutex device_mutex_; +}; + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.cc b/lite/backends/huawei_ascend_npu/model_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..02a8014210b24f8ae143ee68341aec0281d5a570 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.cc @@ -0,0 +1,398 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
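A usage sketch for the Device singleton defined above (illustrative, not part of the patch): loading a prebuilt om model onto Ascend device 0. The model path is a placeholder and error handling is reduced to a null check, since LoadFromFile already logs the failure reason:

#include "lite/backends/huawei_ascend_npu/device.h"

void LoadOmModelExample() {
  auto& device = paddle::lite::huawei_ascend_npu::Device::Global();
  // Returns a shared AclModelClient on success, nullptr if the file is
  // missing or the ACL loader rejects it.
  auto model_client = device.LoadFromFile("model.om", /*device_id=*/0);
  if (model_client == nullptr) {
    // fall back or abort; details are already written to the log
  }
}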
+ +#include "lite/backends/huawei_ascend_npu/model_client.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +bool AclModelClient::LoadFromMem(const void* data, uint32_t size) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + + auto ret = aclmdlQuerySizeFromMem( + data, size, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromMemWithMem(data, + size, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + model_desc_ = aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success."; + load_flag_ = true; + return true; +} + +bool AclModelClient::LoadFromFile(const char* model_path) { + if (load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!"; + return true; + } + auto ret = + aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!"; + return false; + } + ret = aclrtMalloc( + &model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory " + "failed, require size is " + << model_memory_size_; + return false; + } + ret = aclrtMalloc( + &model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth " + "failed, require size is " + << model_weight_size_; + return false; + } + ret = aclmdlLoadFromFileWithMem(model_path, + &model_id_, + model_memory_ptr_, + model_memory_size_, + model_weight_ptr_, + model_weight_size_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!"; + return false; + } + model_desc_ = aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!"; + return false; + } + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + load_flag_ = true; + return true; +} + +bool AclModelClient::GetModelIOTensorDim( + std::vector* input_tensor, + std::vector* output_tensor) { + if (!model_desc_) { + 
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!"; + return false; + } + size_t input_num = aclmdlGetNumInputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num; + for (size_t i = 0; i < input_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ...."; + aclmdlIODims input_dim; + aclmdlGetInputDims(model_desc_, i, &input_dim); + aclDataType data_type = aclmdlGetInputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetInputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format); + input_tensor->push_back(tensor_desc); + } + + size_t output_num = aclmdlGetNumOutputs(model_desc_); + VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num; + for (size_t i = 0; i < output_num; i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ...."; + aclmdlIODims output_dim; + aclmdlGetOutputDims(model_desc_, i, &output_dim); + aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is " + << data_type; + aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i); + VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is " + << data_format; + TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format); + output_tensor->push_back(tensor_desc); + } + return true; +} + +bool AclModelClient::GetTensorFromDataset( + std::vector>* output_tensor) { + size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_); + size_t tensor_output_num = reinterpret_cast(output_tensor->size()); + if (device_output_num != tensor_output_num) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output number not equal, device number is " + << device_output_num << "tensor number is " << tensor_output_num; + return false; + } + for (size_t i = 0; i < device_output_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + uint32_t device_size = aclGetDataBufferSize(buffer_device); + + void* tensor_data = nullptr; + aclError ret = aclrtMallocHost(&tensor_data, device_size); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret; + return false; + } + ret = aclrtMemcpy(tensor_data, + device_size, + device_data, + device_size, + ACL_MEMCPY_DEVICE_TO_HOST); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret; + return false; + } + if (output_tensor->at(i)->SetData(reinterpret_cast(tensor_data), + device_size) != ge::GRAPH_SUCCESS) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed"; + return false; + } + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed."; + return true; +} + +void AclModelClient::CreateInputDataset( + std::vector>* input_tensor) { + input_dataset_ = aclmdlCreateDataset(); + if (input_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!"; + return; + } + + for (size_t i = 0; i < input_tensor->size(); i++) { + auto item = input_tensor->at(i); + size_t buffer_size = item->GetSize(); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != 
ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is " + << buffer_size; + return; + } + void* buffer_data = reinterpret_cast(item->GetData()); + ret = aclrtMemcpy(buffer_device, + buffer_size, + buffer_data, + buffer_size, + ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is " + << buffer_size; + aclrtFree(buffer_device); + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed."; +} +void AclModelClient::CreateOutputDataset( + std::vector>* output_tensor) { + output_dataset_ = aclmdlCreateDataset(); + if (output_dataset_ == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!"; + return; + } + size_t output_size = aclmdlGetNumOutputs(model_desc_); + CHECK_EQ(output_size, output_tensor->size()); + for (size_t i = 0; i < output_size; i++) { + size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i); + void* buffer_device = nullptr; + aclError ret = + aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is " + << buffer_size; + return; + } + aclDataBuffer* data_buffer = + aclCreateDataBuffer(buffer_device, buffer_size); + if (data_buffer == nullptr) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!"; + aclrtFree(buffer_device); + return; + } + if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) != + ACL_ERROR_NONE) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!"; + aclrtFree(buffer_device); + aclDestroyDataBuffer(data_buffer); + return; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed."; +} + +bool AclModelClient::ModelExecute( + std::vector>* input_tensor, + std::vector>* output_tensor) { + // check model exists + if (model_desc_ == nullptr) { + LOG(ERROR) + << "[HUAWEI_ASCEND_NPU] no model description, model execution failed!"; + return false; + } + // create input/output dataset + CreateInputDataset(input_tensor); + CreateOutputDataset(output_tensor); + + // model execution + ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_)); + + // get output + if (!GetTensorFromDataset(output_tensor)) { + LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:" + << model_id_; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:" + << model_id_; + + return true; +} + +void AclModelClient::DestroyDataset(aclmdlDataset** dataset) { + if (*dataset == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!"; + return; + } + + size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset); + for (size_t i = 0; i < dataset_num; i++) { + aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i); + void* device_data = aclGetDataBufferAddr(buffer_device); + if (device_data == nullptr) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce 
data!"; + } else { + if (aclrtFree(device_data) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!"; + } + } + if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!"; + } + } + if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!"; + } + *dataset = nullptr; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success."; +} + +bool AclModelClient::UnloadModel() { + if (!load_flag_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is " + << load_flag_; + return true; + } + + DestroyDataset(&input_dataset_); + DestroyDataset(&output_dataset_); + + aclError ret = aclmdlUnload(model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "unload model failed, model id is " << model_id_; + return false; + } + if (model_desc_ != nullptr) { + (void)aclmdlDestroyDesc(model_desc_); + model_desc_ = nullptr; + } + + if (model_memory_ptr_ != nullptr) { + aclrtFree(model_memory_ptr_); + model_memory_ptr_ = nullptr; + model_memory_size_ = 0; + } + + if (model_weight_ptr_ != nullptr) { + aclrtFree(model_weight_ptr_); + model_weight_ptr_ = nullptr; + model_weight_size_ = 0; + } + load_flag_ = false; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_; + return true; +} + +uint32_t AclModelClient::num_devices() { + uint32_t count = 0; + ACL_CALL(aclrtGetDeviceCount(&count)); + return count; +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/huawei_ascend_npu/model_client.h b/lite/backends/huawei_ascend_npu/model_client.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf19b26261a4ff0301b493c7edf2de6ce3f7ec1 --- /dev/null +++ b/lite/backends/huawei_ascend_npu/model_client.h @@ -0,0 +1,179 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/backends/huawei_ascend_npu/utils.h" + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +class TensorDesc { + public: + TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) { + if (format == ACL_FORMAT_NHWC) { + dim_order[1] = 3; + dim_order[2] = 1; + dim_order[3] = 2; + } + // create ge::Tensordesc + ge_tensor_desc_ = new ge::TensorDesc( + GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type)); + CHECK(ge_tensor_desc_ != nullptr); + } + ~TensorDesc() { ge_tensor_desc_ = nullptr; } + int64_t GetNumber() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[0]); + } + int64_t GetChannel() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[1]); + } + int64_t GetHeight() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[2]); + } + int64_t GetWidth() const { + return ge_tensor_desc_->GetShape().GetDim(dim_order[3]); + } + const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; } + + private: + ge::Shape GetGeShape(aclmdlIODims dims) { + ge::Shape ge_shape({0, 0, 0, 0}); + for (size_t i = 0; i < dims.dimCount; i++) { + if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <" + << dims.dims[i] << ">"; + } + } + return ge_shape; + } + ge::Format GetGeFormat(aclFormat format) { + ge::Format ge_format = ge::FORMAT_NCHW; + switch (format) { + case ACL_FORMAT_NCHW: + ge_format = ge::FORMAT_NCHW; + break; + case ACL_FORMAT_NHWC: + ge_format = ge::FORMAT_NHWC; + break; + case ACL_FORMAT_ND: + ge_format = ge::FORMAT_ND; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format; + break; + } + return ge_format; + } + ge::DataType GetGeDataType(aclDataType data_type) { + ge::DataType ge_datatype = ge::DT_FLOAT; + switch (data_type) { + case ACL_FLOAT: + ge_datatype = ge::DT_FLOAT; + break; + case ACL_FLOAT16: + ge_datatype = ge::DT_FLOAT16; + break; + case ACL_INT8: + ge_datatype = ge::DT_INT8; + break; + case ACL_INT16: + ge_datatype = ge::DT_INT16; + break; + case ACL_INT32: + ge_datatype = ge::DT_INT32; + break; + case ACL_INT64: + ge_datatype = ge::DT_INT64; + break; + case ACL_BOOL: + ge_datatype = ge::DT_BOOL; + break; + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!"; + break; + } + return ge_datatype; + } + + private: + ge::TensorDesc* ge_tensor_desc_{nullptr}; + // n c h w order, default to ACL_FORMAT_NCHW + std::vector dim_order{0, 1, 2, 3}; +}; + +class AclModelClient { + public: + explicit AclModelClient(int device_id) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: " + << device_id; + device_num_ = num_devices(); + if (device_id < 0 || device_id >= device_num_) { + LOG(FATAL) << "Failed with invalid device id " << device_id; + return; + } + device_id_ = device_id; + ACL_CALL(aclrtSetDevice(device_id_)); + } + + ~AclModelClient() { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: " + << device_id_; + ACL_CALL(aclrtResetDevice(device_id_)); + } + + bool LoadFromMem(const void* data, uint32_t size); + bool LoadFromFile(const char* model_path); + bool GetModelIOTensorDim(std::vector* input_tensor, + std::vector* output_tensor); + bool ModelExecute(std::vector>* input_tensor, + std::vector>* output_tensor); + bool UnloadModel(); + + private: + void CreateInputDataset( + std::vector>* input_tensor); + 
void CreateOutputDataset(
+      std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
+  bool GetTensorFromDataset(
+      std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
+  void DestroyDataset(aclmdlDataset** dataset);
+
+ private:
+  uint32_t num_devices();
+
+ private:
+  int device_id_{0};
+  int device_num_{0};
+  aclrtContext context_{nullptr};
+  bool load_flag_{false};
+  uint32_t model_id_{0};
+  size_t model_memory_size_;
+  size_t model_weight_size_;
+  void* model_memory_ptr_;
+  void* model_weight_ptr_;
+  aclmdlDesc* model_desc_{nullptr};
+  aclmdlDataset* input_dataset_{nullptr};
+  aclmdlDataset* output_dataset_{nullptr};
+};
+
+}  // namespace huawei_ascend_npu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/huawei_ascend_npu/utils.h b/lite/backends/huawei_ascend_npu/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2bff3f87e0831f7b98be60ef3980f10da610f10
--- /dev/null
+++ b/lite/backends/huawei_ascend_npu/utils.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "acl/acl.h"
+#include "ge/ge_api_types.h"
+#include "ge/ge_ir_build.h"
+#include "graph/ge_error_codes.h"
+#include "graph/graph.h"
+#include "graph/tensor.h"
+#include "graph/types.h"
+#include "lite/utils/cp_logging.h"
+
+/*
+ * This file contains some Huawei Ascend NPU specific utils.
+ */ + +#define ACL_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ACL_ERROR_NONE) \ + << (msg) << " Huawei Ascend NPU ACL Error: " \ + << ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \ + reinterpret_cast(msg)) + +#define ATC_CALL(msg) \ + CHECK_EQ(reinterpret_cast(msg), ge::GRAPH_SUCCESS) \ + << (msg) << " Huawei Ascend NPU ATC Error: " \ + << ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \ + reinterpret_cast(msg)) + +namespace paddle { +namespace lite { +namespace huawei_ascend_npu { + +static const char* AtcErrorInfo(uint32_t error) { + switch (error) { +#define LITE_ATC_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF + LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649 +#undef LITE_ATC_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +static const char* AclErrorInfo(int error) { + switch (error) { +#define LITE_ACL_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003 + LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007 + LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011 + LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012 + LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013 + LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018 + LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024 + LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025 + LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026 + LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029 + LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030 + LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033 + LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034 + LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035 + LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036 + LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037 + LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038 + LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000 + LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001 + 
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002 + LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003 + LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004 + LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005 + LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006 + LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000 + LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000 + LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001 + LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002 + LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003 + LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004 + LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005 +#undef LITE_ACL_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace huawei_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index 22f760e39f86b29ccf025a83b2a43c87882f9e02..2b2d5321ba6dbac7ff002039c3c8a0423cbe0a6e 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -33,7 +33,7 @@ std::shared_ptr Device::Load( // Check HiAI DDK version const char* ddk_version = model_client->GetVersion(); if (ddk_version) { - LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version; + VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version; } else { LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!"; } diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index d8232cda4c790646fb5a4aae7d4e00d272d3a640..fe6b8fcd99d3f615aefd25145e97b7a08a537794 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() { } bool CLRuntime::Init() { - if (initialized_) { + if (is_cl_runtime_initialized_) { return true; } bool is_platform_init = InitializePlatform(); bool is_device_init = InitializeDevice(); - is_init_success_ = is_platform_init && is_device_init; - initialized_ = true; - - context_ = CreateContext(); - command_queue_ = CreateCommandQueue(context()); - return initialized_; + LOG(INFO) << "is_platform_init:" << is_platform_init; + LOG(INFO) << "is_device_init:" << is_device_init; + if ((is_platform_init == true) && (is_device_init == true)) { + is_platform_device_init_success_ = true; + context_ = CreateContext(); + command_queue_ = CreateCommandQueue(context()); + is_cl_runtime_initialized_ = true; + } + return is_cl_runtime_initialized_; } cl::Platform& CLRuntime::platform() { @@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() { } cl::Device& CLRuntime::device() { - CHECK(device_ != nullptr) << "device_ is not initialized!"; + if (device_ == nullptr) { + LOG(ERROR) << "device_ is not initialized!"; + } return *device_; } @@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) { } bool CLRuntime::InitializeDevice() { + VLOG(3) << "device_info_.size():" << device_info_.size(); + for (auto i : device_info_) { + VLOG(3) << ">>> " << i.first << " " << i.second; + } + if (device_info_.size() > 0 && device_info_.size() <= 2) { + return false; + } + device_info_["PLACEHOLDER"] = 1; // ===================== BASIC ===================== // CL_DEVICE_TYPE_GPU // CL_DEVICE_NAME @@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() { status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); if (all_devices.empty()) { - LOG(FATAL) << "No OpenCL GPU device found!"; + LOG(ERROR) << "No available OpenCL GPU device 
found!"; return false; } device_ = std::make_shared(); @@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() { } std::map& CLRuntime::GetDeviceInfo() { - if (0 != device_info_.size()) { - return device_info_; - } InitializeDevice(); return device_info_; } diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 3eeea7d63ae8f81e7eb395bc0da70caaf94c2a79..7e28130e15da0d45e62d984202f76aa1aff9762c 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_utility.h" +#include "lite/backends/opencl/cl_wrapper.h" typedef enum { UNKNOWN = 0, @@ -68,6 +69,28 @@ class CLRuntime { public: static CLRuntime* Global(); + bool OpenCLAvaliableForDevice() { + bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound(); + LOG(INFO) << "opencl_lib_found:" << opencl_lib_found; + if (opencl_lib_found == false) return false; + + bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess(); + LOG(INFO) << "dlsym_success:" << dlsym_success; + if (opencl_lib_found == false) return false; + + InitializeDevice(); + bool support_fp16 = + static_cast(device_info_["CL_DEVICE_EXTENSIONS_FP16"]); + LOG(INFO) << "support_fp16:" << support_fp16; + if (support_fp16 == false) return false; + + is_device_avaliable_for_opencl_ = + dlsym_success && opencl_lib_found && support_fp16; + LOG(INFO) << "is_device_avaliable_for_opencl_:" + << is_device_avaliable_for_opencl_; + return is_device_avaliable_for_opencl_; + } + bool Init(); cl::Platform& platform(); @@ -85,7 +108,7 @@ class CLRuntime { bool BuildProgram(cl::Program* program, const std::string& options = ""); - bool IsInitSuccess() { return is_init_success_; } + bool IsInitSuccess() { return is_platform_device_init_success_; } std::string cl_path() { return cl_path_; } @@ -167,9 +190,11 @@ class CLRuntime { cl_int status_{CL_SUCCESS}; - bool initialized_{false}; + bool is_device_avaliable_for_opencl_{false}; + + bool is_cl_runtime_initialized_{false}; - bool is_init_success_{false}; + bool is_platform_device_init_success_{false}; }; } // namespace lite diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index f16baca860de4c567b27a4e3ee364e47db74f4ca..5580a487eaaaf77676d2d6bd41542596504774a4 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -19,14 +19,16 @@ limitations under the License. 
*/ namespace paddle { namespace lite { + CLWrapper *CLWrapper::Global() { static CLWrapper wrapper; return &wrapper; } CLWrapper::CLWrapper() { - CHECK(InitHandle()) << "Fail to initialize the OpenCL library!"; - InitFunctions(); + opencl_lib_found_ = InitHandle(); + CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!"; + dlsym_success_ = InitFunctions(); } bool CLWrapper::InitHandle() { @@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() { } } -void CLWrapper::InitFunctions() { +bool CLWrapper::InitFunctions() { CHECK(handle_ != nullptr) << "The library handle can't be null!"; + bool dlsym_success = true; #define PADDLE_DLSYM(cl_func) \ do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(FATAL) << "Cannot find the " << #cl_func \ + LOG(ERROR) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ + dlsym_success = false; \ break; \ } \ VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \ @@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() { PADDLE_DLSYM(clEnqueueCopyImage); #undef PADDLE_DLSYM + return dlsym_success; } } // namespace lite diff --git a/lite/backends/opencl/cl_wrapper.h b/lite/backends/opencl/cl_wrapper.h index 35ef33e5a2f3973217e0e4c36caf1f8eb0fbdcb2..4df86b4028f92883718e7da0967f4a88ab20cc6d 100644 --- a/lite/backends/opencl/cl_wrapper.h +++ b/lite/backends/opencl/cl_wrapper.h @@ -508,13 +508,20 @@ class CLWrapper final { return clEnqueueCopyImage_; } + bool OpenclLibFound() { return opencl_lib_found_; } + + bool DlsymSuccess() { return dlsym_success_; } + private: CLWrapper(); CLWrapper(const CLWrapper &) = delete; CLWrapper &operator=(const CLWrapper &) = delete; bool InitHandle(); - void InitFunctions(); + bool InitFunctions(); + bool opencl_lib_found_{true}; + bool dlsym_success_{true}; void *handle_{nullptr}; + clGetPlatformIDsType clGetPlatformIDs_{nullptr}; clGetPlatformInfoType clGetPlatformInfo_{nullptr}; clBuildProgramType clBuildProgram_{nullptr}; diff --git a/lite/backends/xpu/debug.h b/lite/backends/xpu/debug.h index 75d18b6f4bf461a871c26c7665d8b48bc2f3db38..56bafc9c3d3a7772af8fc8afd10fc7efa3415ef7 100644 --- a/lite/backends/xpu/debug.h +++ b/lite/backends/xpu/debug.h @@ -19,7 +19,7 @@ #include #include #include -#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/backends/xpu/target_wrapper.h" namespace paddle { namespace lite { @@ -82,8 +82,8 @@ void DumpXPUMem(const T* ptr, size_t item_per_line = 30) { size_t after_stride_len = (len + stride - 1) / stride; std::unique_ptr cpu_mem(new T[len]); - xpu_memcpy( - cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + XPU_CALL(xpu_memcpy( + cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST)); std::unique_ptr after_stride(new T[after_stride_len]); for (size_t i = 0; i < after_stride_len; ++i) { after_stride[i] = cpu_mem[i * stride]; diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 85a0023590858ab72e9e4f258d62dce809888918..a322418ccde20a34dc6c6ba9b47601a9a658f99c 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -19,11 +19,11 @@ namespace lite { void* TargetWrapperXPU::Malloc(size_t size) { void* ptr{nullptr}; - xpu_malloc(&ptr, size); + XPU_CALL(xpu_malloc(&ptr, size)); return ptr; } -void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } +void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); } void TargetWrapperXPU::MemcpySync(void* dst, const void* src, @@ -31,10 +31,10 
@@ void TargetWrapperXPU::MemcpySync(void* dst, IoDirection dir) { switch (dir) { case IoDirection::HtoD: - xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE)); break; case IoDirection::DtoH: - xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST)); break; default: LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); @@ -49,7 +49,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size, } else { ptr = TargetWrapperXPU::Malloc(size); } - CHECK(ptr != nullptr); + CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3; return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3)); } diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index b84b5d75e74a14e81091b003aa3ae5514e53a42c..070184a13088a169fe38f1b8105a0803d9915da1 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -16,11 +16,23 @@ #include // std::unique_ptr #include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free -#include "lite/core/target_wrapper.h" +#include "lite/core/target_wrapper.h" // TargetWrapper +#include "lite/utils/cp_logging.h" // CHECK_EQ + +#define XPU_CALL(func) \ + { \ + auto e = (func); \ + CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \ + } namespace paddle { namespace lite { +// MAX(lod.size()) = 64 +const int XPU_MAX_LOD_SIZE = 64; +// MAX(lod[i + 1] - lod[i]) = 512 +const int XPU_MAX_LOD_SEQ_LEN = 512; + using TargetWrapperXPU = TargetWrapper; struct XPUScratchPad { @@ -33,7 +45,7 @@ struct XPUScratchPad { struct XPUScratchPadDeleter { void operator()(XPUScratchPad* sp) const { if (!sp->is_l3_) { - xpu_free(sp->addr_); + XPU_CALL(xpu_free(sp->addr_)); } delete sp; } @@ -55,7 +67,7 @@ class TargetWrapper { size_t size, IoDirection dir); - static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true); + static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false); static xdnn::Context* GetRawContext() { if (tls_raw_ctx_ == nullptr) { @@ -77,11 +89,10 @@ class TargetWrapper { static void SetDev(int dev_no = 0) { const char* dev_env = getenv("LITE_XPU_DEV"); if (dev_env) { - xpu_set_device(atoi(dev_env)); - return; + dev_no = atoi(dev_env); } - xpu_set_device(dev_no); + XPU_CALL(xpu_set_device(dev_no)); } static std::string multi_encoder_precision; // NOLINT diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 75971570fb078ce4e39413e5b3df629fe2a7ac3e..53988f063b89ae3e75f4c27cc1d937d12bb6dae5 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 1138a3bcc2e3e3f3c77d94bf8128b8231f930550..599e8f6c3791ac68474ca27e6c627bd2fc43765a 100644 --- 
a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -24,7 +24,7 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; static const std::set subgraph_op_supported_targets( - {TARGET(kNPU), TARGET(kXPU)}); + {TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)}); bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != subgraph_op_supported_targets.end(); #if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) @@ -32,25 +32,35 @@ void TestCase::CreateInstruction() { #endif if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc + auto sub_program_desc = std::make_shared(); int sub_block_idx = 0; - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_desc_; + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_desc_; // Add the block desc into the subgraph op which used to replace the // original op op_desc_.reset(new cpp::OpDesc()); op_desc_->SetType("subgraph"); op_desc_->SetAttr("sub_block", sub_block_idx); - auto in_names = sub_block_op_desc->input_vars(); - auto out_names = sub_block_op_desc->output_vars(); + auto in_names = sub_op_desc->input_vars(); + auto out_names = sub_op_desc->output_vars(); op_desc_->SetInput("Inputs", in_names); op_desc_->SetOutput("Outputs", out_names); - op_desc_->SetAttr>("input_data_names", in_names); + // filter only data op (not const op by persisiable) + std::vector in_data_names; + for (auto name : in_names) { + if (!(inst_scope_->FindTensor(name)->persistable())) { + in_data_names.push_back(name); + } + } + op_desc_->SetAttr>("input_data_names", + in_data_names); op_desc_->SetAttr>("output_data_names", out_names); op = LiteOpRegistry::Global().Create(op_desc().Type()); - static_cast(op.get())->SetSubBlock(sub_block_desc); + static_cast(op.get())->SetProgramDesc( + sub_program_desc); } else { op = LiteOpRegistry::Global().Create(op_desc().Type()); } @@ -60,7 +70,7 @@ void TestCase::CreateInstruction() { // filter out the target kernel CHECK(!kernels.empty()) << "No kernel found for place " << place_.DebugString(); - auto it = std::remove_if( + auto it = std::find_if( kernels.begin(), kernels.end(), [&](std::unique_ptr& k) { return k->alias() == alias_; }); @@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name, return success; } -TestCase::~TestCase() { - if (op_desc_->Type() == "subgraph") { - // Release the subblock desc of Subgraph op - auto subgraph_op = const_cast( - static_cast(instruction_->op())); - CHECK(subgraph_op); - auto sub_block_desc = subgraph_op->GetSubBlock(); - if (sub_block_desc) { - delete sub_block_desc; - } - } -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 4e73768e53576f03e47158618fa4f0eac0851382..4ccb05428d38c65f8cad36f1702c034cfe62705b 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -46,7 +46,7 @@ class TestCase { base_scope_(new Scope) { ctx_ = ContextScheduler::Global().NewContext(place_.target); } - virtual ~TestCase(); + virtual ~TestCase() {} void Prepare() { PrepareData(); diff --git a/lite/core/context.cc b/lite/core/context.cc index f14d1dfddea806ab3839f6f897b9d4d3fe396ca8..abb44945ec66e1a89efc1ccb08ec1df370f2e099 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -17,8 
+17,13 @@ namespace paddle { namespace lite { -#ifdef LITE_WITH_NPU -std::string Context::subgraph_model_cache_dir_{""}; // NOLINT +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +thread_local std::string + Context::subgraph_model_cache_dir_{ + ""}; // NOLINT +thread_local int + Context::huawei_ascend_device_id_{ + 0}; // NOLINT #endif #ifdef LITE_WITH_MLU diff --git a/lite/core/context.h b/lite/core/context.h index c3993d9589eeac442eaa827152fd1293852396db..69f6a4b9d6bc87422d06e66e8d329547ccf5f24a 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -39,6 +39,7 @@ #include #include #include "lite/core/device_info.h" +#include "lite/core/scope.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" @@ -61,6 +62,7 @@ using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; using RKNPUContext = Context; +using HuaweiAscendNPUContext = Context; template <> class Context { @@ -84,6 +86,35 @@ class Context { NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } + static void SetSubgraphModelCacheDir(Scope* scope, + std::string subgraph_model_cache_dir) { + auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR"); + CHECK(var); + auto data = var->GetMutable(); + CHECK(data); + *data = subgraph_model_cache_dir; + } + static std::string SubgraphModelCacheDir(Scope* scope) { + auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR"); + if (!var) return ""; + return var->Get(); + } +}; +#endif + +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU +template <> +class Context { + public: + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(HuaweiAscendNPUContext* ctx) {} + + HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) { + return *this; + } + std::string name() const { return "HuaweiAscendNPUContext"; } + static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) { subgraph_model_cache_dir_ = subgraph_model_cache_dir; } @@ -91,8 +122,14 @@ class Context { return subgraph_model_cache_dir_; } + static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) { + huawei_ascend_device_id_ = huawei_ascend_device_id; + } + static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; } + private: - static std::string subgraph_model_cache_dir_; + static thread_local std::string subgraph_model_cache_dir_; + static thread_local int huawei_ascend_device_id_; }; #endif @@ -385,6 +422,13 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + case TARGET(kHuaweiAscendNPU): + kernel_contexts_[TargetType::kHuaweiAscendNPU] + .As() + .CopySharedTo(&ctx->As()); + break; +#endif #ifdef LITE_WITH_APU case TARGET(kAPU): kernel_contexts_[TargetType::kAPU].As().CopySharedTo( @@ -466,6 +510,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + InitContext(); +#endif #ifdef LITE_WITH_APU InitContext(); #endif diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index be09ed4b1a63154b8561f4d39cff7d987a9fcba7..cd129b332fa79dc45d74dc8a0befc1e67a68c316 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -18,6 +18,7 @@ lite_cc_library(mir_passes fusion/conv_activation_fuse_pass.cc fusion/var_conv_2d_activation_fuse_pass.cc fusion/conv_bn_fuse_pass.cc + fusion/conv_conv_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc @@ -32,6 
+33,7 @@ lite_cc_library(mir_passes elimination/identity_dropout_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc elimination/remove_tf_redundant_ops_pass.cc + elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc static_kernel_pick_pass.cc variable_place_inference_pass.cc type_target_cast_pass.cc diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..7866cb956c4e51d3b69687751325ca3ff4eda9d6 --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +// Remove all of the unused nodes from the contorl flow op and update the inputs +// and outputs of the op info The unused nodes are defined as the nodes which +// are only linked to the control flow op nodes but nerver linked to the other +// op nodes. 
+// +// For example: +// graph[0]: main block +// in_x +// in_f | in_z(unused node) +// \ | / +// \ | / +// in_w ------- while ------- in_y(unused_node) +// / | +// / | +// (unused node)out_y | +// out_x +// +// graph[1]: sub block +// in_x +// | +// | +// conv2d----in_f +// | +// | +// fc ------in_w +// | +// | +// softmax +// | +// | +// out_x +// +// After the pass is applied: +// in_x +// in_f | +// \ | +// \ | +// in_w ------- while +// | +// | +// | +// out_x + +// Remove the var node from var2rm if it is recursively referred to any op in +// the subblock +void CollectUnusedInputOutputNodes( + int block_idx, + std::vector>* graphs, + const std::unordered_set& control_flow_op_types, + std::unordered_map* in_vars2rm, + std::unordered_map* out_vars2rm) { + auto block_size = graphs->size(); + for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + auto op_info = op_node->AsStmt().op_info(); + auto op_type = op_info->Type(); + if (control_flow_op_types.count(op_type)) { + int sub_block_idx = op_info->GetAttr("sub_block"); + CHECK(block_idx >= 0 && block_idx < block_size); + CollectUnusedInputOutputNodes(sub_block_idx, + graphs, + control_flow_op_types, + in_vars2rm, + out_vars2rm); + } else { + for (auto& var_node : op_node->inlinks) { + auto& var_name = var_node->AsArg().name; + if (in_vars2rm->count(var_name)) { + in_vars2rm->erase(var_name); + } + } + for (auto& var_node : op_node->outlinks) { + auto& var_name = var_node->AsArg().name; + // Tensor array may be only used as the output vars in the sublock + if (in_vars2rm->count(var_name)) { + in_vars2rm->erase(var_name); + } + if (out_vars2rm->count(var_name)) { + out_vars2rm->erase(var_name); + } + } + } + } +} + +// Remove the unused var nodes from the graph and update the op_info of the +// control flow op +void RemoveNodesFromGraphAndUpdateOpInfo( + SSAGraph* graph, + Node* op_node, + const std::unordered_map& in_vars2rm, + const std::unordered_map& out_vars2rm) { + auto op_info = op_node->AsStmt().mutable_op_info(); + auto op_type = op_info->Type(); + // Unlink the in_vars2rm and out_vars2rm from the control flow op node, and + // remove them if nerver used. + for (auto& var_node : in_vars2rm) { + VLOG(3) << "in var node '" << var_node.first << "' is unlinked to " + << op_type; + RemoveDirectedLink(var_node.second, op_node); + } + for (auto& var_node : out_vars2rm) { + VLOG(3) << "out var node '" << var_node.first << "' is unlinked from " + << op_type; + RemoveDirectedLink(op_node, var_node.second); + // Unlink from all of the out op nodes. 
+ std::unordered_set out_op_nodes; + for (auto* out_op_node : var_node.second->outlinks) { + if (!out_op_nodes.count(out_op_node)) { + out_op_nodes.insert(out_op_node); + } + } + for (auto* out_op_node : out_op_nodes) { + RemoveDirectedLink(var_node.second, out_op_node); + } + } + // Remove the unused nodes from the graph if their inlinks and outlinks are + // empty + std::unordered_set removed_var_nodes; + for (auto& var_node : in_vars2rm) { + if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() && + !removed_var_nodes.count(var_node.second)) { + removed_var_nodes.insert(var_node.second); + graph->RemoveNode(var_node.second); + VLOG(3) << "in var node " << var_node.first << " is removed"; + } + } + for (auto& var_node : out_vars2rm) { + if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() && + !removed_var_nodes.count(var_node.second)) { + removed_var_nodes.insert(var_node.second); + graph->RemoveNode(var_node.second); + VLOG(3) << "out var node " << var_node.first << " is removed"; + } + } + // Update the op info of the control flow op + for (auto& input : *op_info->mutable_inputs()) { + for (auto var = input.second.begin(); var != input.second.end();) { + if (in_vars2rm.count(*var)) { + var = input.second.erase(var); + } else { + ++var; + } + } + } + for (auto& output : *op_info->mutable_outputs()) { + for (auto var = output.second.begin(); var != output.second.end();) { + if (out_vars2rm.count(*var)) { + var = output.second.erase(var); + } else { + ++var; + } + } + } +} + +void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs( + std::vector>* graphs) { + CHECK(graphs && !graphs->empty()); + graphs_ = graphs; +} + +void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply( + const std::unique_ptr& graph) { + // Remove the unused input and output nodes from the control flow op nodes + // Which are only linked to the control flow op nodes but nerver linked to the + // other op nodes + const std::unordered_set control_flow_op_types = { + "while", "conditional_block"}; + auto block_size = graphs_->size(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + auto op_info = op_node->AsStmt().mutable_op_info(); + auto op_type = op_info->Type(); + if (!control_flow_op_types.count(op_type)) continue; + int sub_block_idx = op_info->GetAttr("sub_block"); + CHECK(sub_block_idx >= 0 && sub_block_idx < block_size); + // Initialize the unused nodes with all of the input and output nodes + std::unordered_map in_vars2rm, out_vars2rm; + for (auto* var_node : op_node->inlinks) { + auto& var_name = var_node->AsArg().name; + if (!in_vars2rm.count(var_name)) { + in_vars2rm.insert(std::pair(var_name, var_node)); + } + } + for (auto* var_node : op_node->outlinks) { + auto& var_name = var_node->AsArg().name; + if (!out_vars2rm.count(var_name)) { + out_vars2rm.insert(std::pair(var_name, var_node)); + } + } + // Remove the nodes which used in subblock recursively, and the remaining + // nodes are the unused one. 
+ CollectUnusedInputOutputNodes(sub_block_idx, + graphs_, + control_flow_op_types, + &in_vars2rm, + &out_vars2rm); + if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) { + // Remove the unused nodes from graph, and update the op info of the + // control flow op + RemoveNodesFromGraphAndUpdateOpInfo( + graph.get(), op_node, in_vars2rm, out_vars2rm); + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS( + control_flow_op_unused_inputs_and_outputs_eliminate_pass, + paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass) + .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2863661de1e93d15bfe835e39033d4ecaee6d8cc --- /dev/null +++ b/lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr &graph) override; + void SetAllGraphs(std::vector> *graphs); + + private: + std::vector> *graphs_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index a7a4cee798c1e8ef5b9b8f8d9e8e5810554fc571..95723bbd21dc02ed8bb5b46c48f9836d3f9aff1f 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation lite_cc_library(fuse_conv_bn SRCS conv_bn_fuser.cc DEPS pattern_matcher_high_api) +lite_cc_library(fuse_conv_conv + SRCS conv_conv_fuser.cc + DEPS pattern_matcher_high_api) lite_cc_library(fuse_elementwise_add_activation SRCS elementwise_add_activation_fuser.cc DEPS pattern_matcher_high_api) @@ -42,6 +45,7 @@ set(mir_fusers fuse_conv_activation fuse_var_conv_activation fuse_conv_bn + fuse_conv_conv fuse_quant_dequant fuse_elementwise_add_activation fuse_transpose_softmax_transpose diff --git a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc index 61aeb2ab1f51ddcd6b153971253f8239472a1031..db950fd4b4d671ed618c8bc53010e5be6f5fd78b 100644 --- a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc @@ -326,6 +326,28 @@ class XPUMmdnnSearchAttentionFuser : public FuseBase { } }; +// 4 inputs +// ======== +// +// input_x +// input_y +// topk_row +// topk_col +// +// input_x ------- match_matrix_tensor ------- input_y +// | +// relu +// ________/ \________ +// | | +// 
var_conv_2d | +// | | +// relu | +// |_______ _______| +// \ / +// sequence_concat +// | +// topk_row ---- sequence_topk_avg_pooling ----- topk_col +// class XPUMmdnnMatchConvTopkFuser : public FuseBase { public: void BuildPattern() override { @@ -418,10 +440,156 @@ class XPUMmdnnMatchConvTopkFuser : public FuseBase { auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); op_desc.SetAttr("input_w_max", - match_op_info->GetAttr("w_max")); + match_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is marked intermediate. + // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +// 2 inputs +// ======== +// +// input_x +// input_y +// +// input_x ------- match_matrix_tensor ------- input_y +// | | | +// | relu | +// | ________/ \________ | +// | | | | +// | var_conv_2d | | +// | | | | +// | relu | | +// | |_______ _______| | +// | \ / | +// | sequence_concat | +// | | | +// |--------- sequence_topk_avg_pooling -------| +// +class XPUMmdnnMatchConvTopkFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* input_y = + VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = 
VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + ->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *input_x >> *topk; + *input_y >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("__xpu__w_max")); op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); auto* conv_op_info = matched.at("conv")->stmt()->op_info(); - op_desc.SetAttr("conv_w_max", conv_op_info->GetAttr("w_max")); + op_desc.SetAttr("conv_w_max", + conv_op_info->GetAttr("__xpu__w_max")); + op_desc.SetAttr("output_channel", + conv_op_info->GetAttr("OutputChannel")); auto* topk_op_info = matched.at("topk")->stmt()->op_info(); op_desc.SetAttr>( "topks", topk_op_info->GetAttr>("topks")); @@ -437,8 +605,7 @@ class XPUMmdnnMatchConvTopkFuser : public FuseBase { new_stmt->SetKernels(std::move(kernels)); // XXX(miaotianxiang): redundant links around |topk| are automatically - // removed as |topk| is - // marked intermediate. + // removed as |topk| is marked intermediate. 
// RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); std::vector arg_names{"conv_w"}; @@ -624,6 +791,15 @@ class XPUMmdnnBidEmbAttFuser : public FuseBase { } }; +// 5 outputs +// ========= +// +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { public: void BuildPattern() override { @@ -818,17 +994,272 @@ class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); op_desc.SetAttr>( "grnn_fw_wh_maxs", - grnn_fw_op_info->GetAttr>("wh_max")); + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); op_desc.SetAttr>( "grnn_fw_wi_maxs", - grnn_fw_op_info->GetAttr>("wi_max")); + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); op_desc.SetAttr>( "grnn_rv_wh_maxs", - grnn_rv_op_info->GetAttr>("wh_max")); + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); op_desc.SetAttr>( "grnn_rv_wi_maxs", - grnn_rv_op_info->GetAttr>("wi_max")); + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +// 6 outputs +// ========= +// +// emb0_out +// eltwise01_out +// seq_pool_right_out +// seq_pool_left_out +// seq_pool_2in1_out +// concat_3in1_out +// +class XPUMmdnnBidEmbGrnnAttFuser2 : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "X") + ->AsOutput(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->assert_is_op_input("search_seq_arithmetic", "Y") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = 
VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 
= OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att2"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("emb0_out", {matched.at("emb0_out")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); op_desc.SetAttr("att_fc_w_max", att_fc_op_info->GetAttr("W_max")); @@ -868,6 +1299,9 @@ class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { class 
XPUMmdnnMergeAllFuser : public FuseBase { public: + explicit XPUMmdnnMergeAllFuser(int n_concat_topk) + : n_concat_topk_(n_concat_topk) {} + void BuildPattern() override { auto* concat_7in1_input0 = VarNode("concat_7in1_input0") ->assert_is_op_nth_input("concat", "X", 0) @@ -909,16 +1343,25 @@ class XPUMmdnnMergeAllFuser : public FuseBase { ->assert_is_op_output("relu", "Out") ->AsIntermediate(); - auto* concat_2in1_input0 = VarNode("concat_2in1_input0") + auto* concat_topk_input0 = VarNode("concat_topk_input0") ->assert_is_op_nth_input("concat", "X", 0) ->AsInput(); - auto* concat_2in1_input1 = VarNode("concat_2in1_input1") + auto* concat_topk_input1 = VarNode("concat_topk_input1") ->assert_is_op_nth_input("concat", "X", 1) ->AsInput(); - auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); - auto* concat_2in1_out = VarNode("concat_2in1_out") + auto* concat_topk = OpNode("concat_topk", "concat")->AsIntermediate(); + auto* concat_topk_out = VarNode("concat_topk_out") ->assert_is_op_output("concat", "Out") ->AsIntermediate(); + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + auto* concat_topk_inputx = VarNode(concat_topk_input_name) + ->assert_is_op_nth_input("concat", "X", i) + ->AsInput(); + *concat_topk_inputx >> *concat_topk; + } + auto* seq_rev = OpNode("seq_rev", "sequence_reverse")->AsIntermediate(); auto* seq_rev_out = VarNode("seq_rev_out") ->assert_is_op_output("sequence_reverse", "Y") @@ -1034,9 +1477,9 @@ class XPUMmdnnMergeAllFuser : public FuseBase { *search_fc0_w >> *search_fc0; *search_fc0_b >> *search_fc0; - *concat_2in1_input0 >> *concat_2in1; - *concat_2in1_input1 >> *concat_2in1; - *concat_2in1 >> *concat_2in1_out >> *seq_rev >> *seq_rev_out; + *concat_topk_input0 >> *concat_topk; + *concat_topk_input1 >> *concat_topk; + *concat_topk >> *concat_topk_out >> *seq_rev >> *seq_rev_out; *seq_rev_out >> *grnn_rv >> *grnn_rv_out >> *seq_pool_rv >> *seq_pool_rv_out; @@ -1047,7 +1490,7 @@ class XPUMmdnnMergeAllFuser : public FuseBase { *grnn_rv >> *grnn_rv_layout_input; *grnn_rv >> *grnn_rv_tmp_buffer; - *concat_2in1_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> + *concat_topk_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> *seq_pool_fw_out; *seq_pool_fw >> *seq_pool_fw_max_idx; *grnn_fw_wh >> *grnn_fw; @@ -1075,8 +1518,8 @@ class XPUMmdnnMergeAllFuser : public FuseBase { op_desc.SetType("__xpu__mmdnn_merge_all"); auto* concat_7in1_op_info = matched.at("concat_7in1")->stmt()->op_info(); op_desc.SetInput("concat_7in1_x", concat_7in1_op_info->Input("X")); - auto* concat_2in1_op_info = matched.at("concat_2in1")->stmt()->op_info(); - op_desc.SetInput("concat_2in1_x", concat_2in1_op_info->Input("X")); + auto* concat_topk_op_info = matched.at("concat_topk")->stmt()->op_info(); + op_desc.SetInput("concat_topk_x", concat_topk_op_info->Input("X")); op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_fw_wh")->arg()->name}); op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_fw_wi")->arg()->name}); op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_rv_wh")->arg()->name}); @@ -1093,23 +1536,26 @@ class XPUMmdnnMergeAllFuser : public FuseBase { auto* grnn_fw_op_info = matched.at("grnn_fw")->stmt()->op_info(); op_desc.SetAttr>( "grnn_fw_wh_maxs", - grnn_fw_op_info->GetAttr>("wh_max")); + grnn_fw_op_info->GetAttr>("__xpu__wh_max")); op_desc.SetAttr>( "grnn_fw_wi_maxs", - grnn_fw_op_info->GetAttr>("wi_max")); + grnn_fw_op_info->GetAttr>("__xpu__wi_max")); auto* grnn_rv_op_info = 
matched.at("grnn_rv")->stmt()->op_info(); op_desc.SetAttr>( "grnn_rv_wh_maxs", - grnn_rv_op_info->GetAttr>("wh_max")); + grnn_rv_op_info->GetAttr>("__xpu__wh_max")); op_desc.SetAttr>( "grnn_rv_wi_maxs", - grnn_rv_op_info->GetAttr>("wi_max")); + grnn_rv_op_info->GetAttr>("__xpu__wi_max")); auto* fc0_op_info = matched.at("search_fc0")->stmt()->op_info(); - op_desc.SetAttr("fc0_w_max", fc0_op_info->GetAttr("w_max")); + op_desc.SetAttr("fc0_w_max", + fc0_op_info->GetAttr("__xpu__w_max")); auto* fc1_op_info = matched.at("search_fc1")->stmt()->op_info(); - op_desc.SetAttr("fc1_w_max", fc1_op_info->GetAttr("w_max")); + op_desc.SetAttr("fc1_w_max", + fc1_op_info->GetAttr("__xpu__w_max")); auto* fc2_op_info = matched.at("search_fc2")->stmt()->op_info(); - op_desc.SetAttr("fc2_w_max", fc2_op_info->GetAttr("w_max")); + op_desc.SetAttr("fc2_w_max", + fc2_op_info->GetAttr("__xpu__w_max")); auto* new_stmt = matched.at("concat_7in1")->stmt(); auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); @@ -1120,8 +1566,8 @@ class XPUMmdnnMergeAllFuser : public FuseBase { new_stmt->SetKernels(std::move(kernels)); std::vector arg_names{ - "concat_2in1_input0", - "concat_2in1_input1", + "concat_topk_input0", + "concat_topk_input1", "grnn_fw_wh", "grnn_fw_wi", "grnn_rv_wh", @@ -1133,6 +1579,11 @@ class XPUMmdnnMergeAllFuser : public FuseBase { "search_fc2_w", "search_fc2_b", }; + for (int i = 2; i < n_concat_topk_; ++i) { + auto concat_topk_input_name = + paddle::lite::string_format("concat_topk_input%d", i); + arg_names.push_back(concat_topk_input_name); + } for (auto name : arg_names) { DirectedLink(matched.at(name), matched.at("concat_7in1")); } @@ -1143,6 +1594,9 @@ class XPUMmdnnMergeAllFuser : public FuseBase { IR_OP_VAR_LINK(matched.at("concat_7in1"), matched.at(name)); } } + + private: + int n_concat_topk_; }; } // namespace fusion @@ -1158,15 +1612,21 @@ class XPUMmdnnFusePass : public ProgramPass { search_att_fuser(graph.get()); fusion::XPUMmdnnMatchConvTopkFuser match_conv_topk_fuser; match_conv_topk_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser2 match_conv_topk_fuser2; + match_conv_topk_fuser2(graph.get()); fusion::XPUMmdnnBidSeqRevEmbEltwiseFuser bi_seq_rev_emb_eltwise_fuser; bi_seq_rev_emb_eltwise_fuser(graph.get()); fusion::XPUMmdnnBidEmbGrnnAttFuser bid_emb_grnn_att_fuser; bid_emb_grnn_att_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser2 bid_emb_grnn_att_fuser2; + bid_emb_grnn_att_fuser2(graph.get()); fusion::XPUMmdnnBidEmbAttFuser bid_emb_att_fuser; bid_emb_att_fuser(graph.get()); - fusion::XPUMmdnnMergeAllFuser merge_all_fuser; - merge_all_fuser(graph.get()); + for (int n_concat_topk : {3, 2}) { + fusion::XPUMmdnnMergeAllFuser merge_all_fuser(n_concat_topk); + merge_all_fuser(graph.get()); + } } }; @@ -1178,6 +1638,7 @@ REGISTER_MIR_PASS(__xpu__mmdnn_fuse_pass, paddle::lite::mir::XPUMmdnnFusePass) .BindTargets({TARGET(kXPU)}) .BindKernel("__xpu__mmdnn_search_attention") .BindKernel("__xpu__mmdnn_bid_emb_grnn_att") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att2") .BindKernel("__xpu__mmdnn_bid_emb_att") .BindKernel("__xpu__mmdnn_match_conv_topk") .BindKernel("__xpu__mmdnn_merge_all"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 04988612192b79824b1294428fa9b1c38d784979..21bc266204d95c0f7faa8c3796e4b6255a3fe741 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : 
public FuseBase { op_desc.SetAttr("act_type", act_type_); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); auto* single_encoder_stmt = matched.at("q_mul")->stmt(); fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); diff --git a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc index b25eb084f286fccfa4afe8832f9dc1ff8384d552..f017cc8c72f93a772f8bcbdc9aa96d5b0ad215d8 100644 --- a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc @@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase { auto block0_stmt = matched.at("left_conv1")->stmt(); // block0_stmt->ResetOp(op_desc, graph->valid_places()); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); block0_stmt->SetOp(fake_subgraph_op); @@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase { auto block1_stmt = matched.at("right_conv1")->stmt(); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); block1_stmt->SetOp(fake_subgraph_op); @@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase { << "Y of last fc must have been transposed"; auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, scope); fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places()); block2_stmt->SetOp(fake_subgraph_op); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc index de2210a76ea0647cb02131a088ceb754afd0ef9c..7024a872f30d3c78affe82648c902a6128de7070 100644 --- a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase { auto block0_stmt = matched.at("left_conv1")->stmt(); // block0_stmt->ResetOp(op_desc, graph->valid_places()); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? 
- auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); block0_stmt->SetOp(fake_subgraph_op); @@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase { auto block1_stmt = matched.at("right_conv1")->stmt(); auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - // XXX: memleak? - auto sub_block_desc = new cpp::BlockDesc(); + auto sub_program_desc = std::make_shared(); + sub_program_desc->AddBlock(); static_cast(fake_subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); block1_stmt->SetOp(fake_subgraph_op); diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/mir/fusion/conv_conv_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9c4f0c02cd89e04d93af8e4dab71acc5d24e411 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
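+
+// ConvConvFusePass folds a conv2d/depthwise_conv2d that feeds a 1x1 conv2d
+// into a single conv op (the matching constraints and the weight/bias folding
+// itself live in ConvConvFuser). It only runs when an ARM kFloat place is
+// registered, since only the fp32 path is supported, and it tries every
+// combination of the two conv types and of bias presence on either conv.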
+ +#include "lite/core/mir/fusion/conv_conv_fuse_pass.h" +#include +#include +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ConvConvFusePass::Apply(const std::unique_ptr& graph) { + // initialze fuser params + std::vector conv_has_bias_cases{true, false}; + std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + bool has_arm = false; + for (auto& place : graph->valid_places()) { + if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { + has_arm = true; + break; + } + } + if (!has_arm) { + return; + } + // only support fp32 fusion + for (auto conv_has_bias0 : conv_has_bias_cases) { + for (auto conv_has_bias1 : conv_has_bias_cases) { + for (auto conv_type0 : conv_type_cases) { + for (auto conv_type1 : conv_type_cases) { + VLOG(4) << "conv_has_bias0:" << conv_has_bias0 + << " conv_type0:" << conv_type0; + VLOG(4) << "conv_has_bias1:" << conv_has_bias1 + << " conv_type1:" << conv_type1; + fusion::ConvConvFuser fuser( + conv_type0, conv_type1, conv_has_bias0, conv_has_bias1); + fuser(graph.get()); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass) + .BindTargets({TARGET(kARM)}); diff --git a/lite/kernels/xpu/utils.h b/lite/core/mir/fusion/conv_conv_fuse_pass.h similarity index 76% rename from lite/kernels/xpu/utils.h rename to lite/core/mir/fusion/conv_conv_fuse_pass.h index d410cb1567d5c60aeb52b798d9f17c7f5692e096..64e1b87ec9a8618572d6044f6dde2ab25c5a11c4 100644 --- a/lite/kernels/xpu/utils.h +++ b/lite/core/mir/fusion/conv_conv_fuse_pass.h @@ -14,18 +14,19 @@ #pragma once -#include "lite/backends/xpu/xpu_header_sitter.h" +#include +#include +#include "lite/core/mir/pass.h" namespace paddle { namespace lite { -namespace kernels { -namespace xpu { +namespace mir { -struct XPUFreeDeleter { - void operator()(void* p) const { xpu_free(p); } +class ConvConvFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; }; -} // namespace xpu -} // namespace kernels +} // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.cc b/lite/core/mir/fusion/conv_conv_fuser.cc new file mode 100644 index 0000000000000000000000000000000000000000..737f96e69baa8953c0231fcc4c9e104907b17381 --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/mir/fusion/conv_conv_fuser.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void ConvConvFuser::BuildPattern() { + auto* conv_input0 = VarNode("conv_input0") + ->assert_is_op_input(conv_type0_, "Input") + ->AsInput(); + auto* conv_weight0 = VarNode("conv_weight0") + ->assert_is_op_input(conv_type0_, "Filter") + ->AsInput(); + auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_); + auto* conv_out0 = VarNode("conv_out0") + ->assert_is_op_output(conv_type0_, "Output") + ->assert_is_op_input(conv_type1_, "Input") + ->AsIntermediate(); + + auto* conv_weight1 = VarNode("conv_weight1") + ->assert_is_op_input(conv_type1_, "Filter") + ->AsIntermediate(); + auto* conv1 = OpNode("conv2d1", conv_type1_) + ->assert_is_op(conv_type1_) + ->assert_op_attr("groups", 1) + ->AsIntermediate(); + + auto* conv_out1 = VarNode("conv_out1") + ->assert_is_op_output(conv_type1_, "Output") + ->AsOutput(); + + if (conv_has_bias0_) { + if (conv_has_bias1_) { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + auto* conv_bias0 = VarNode("conv_bias0") + ->assert_is_op_input(conv_type0_, "Bias") + ->AsIntermediate(); + conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0}) + .LinksTo({conv_out0}); + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } else { + conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0}); + if (conv_has_bias1_) { + auto* conv_bias1 = VarNode("conv_bias1") + ->assert_is_op_input(conv_type1_, "Bias") + ->AsInput(); + conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1}) + .LinksTo({conv_out1}); + } else { + conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1}); + } + } +} + +void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { + auto conv_instruct = matched.at("conv2d0")->stmt(); + auto conv_op_desc = conv_instruct->mutable_op_info(); + auto conv = conv_instruct->op(); + auto* scope = conv->scope(); + auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info(); + + // conv0 + auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name) + ->GetMutable(); + + // conv1 + auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name) + ->GetMutable(); + // auto groups0 = conv_op_desc->GetAttr("groups"); + auto groups1 = conv_op_desc1->GetAttr("groups"); + auto strides1 = conv_op_desc1->GetAttr>("strides"); + auto paddings1 = conv_op_desc1->GetAttr>("paddings"); + auto dilations1 = conv_op_desc1->GetAttr>("dilations"); + + bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; + bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? 
true : false;
+  int kw = weight1_t->dims()[2];
+  int kh = weight1_t->dims()[3];
+  if (!(kw == 1 && kh == 1)) {
+    return;
+  }
+  CHECK_EQ(enable0_int8, enable1_int8)
+      << "The two convs' compute types must be the same";
+  CHECK_EQ(groups1, 1) << "The second conv's groups must be 1";
+  CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1])
+      << "weight0_dims[0] must equal weight1_dims[1]";
+  for (int i = 0; i < strides1.size(); i++) {
+    CHECK_EQ(strides1[i], 1) << "strides1[" << i << "]: " << strides1[i]
+                             << " must be 1";
+  }
+  for (int i = 0; i < paddings1.size(); i++) {
+    CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i]
+                              << " must be 0";
+  }
+  for (int i = 0; i < dilations1.size(); i++) {
+    CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i]
+                               << " must be 1";
+  }
+  // compute new_weight and new_bias
+  ///////////////////////////////////////////////////////////////////////////
+  // Before fusion:
+  //
+  //   conv0(x) = kx + z = y
+  //   conv1(y) = ay + b
+  //
+  // After fusion:
+  //
+  //   conv1(conv0(x)) = a(kx + z) + b = akx + az + b
+  //
+  //   new_weight = ak
+  //   new_bias   = az + b
+  ///////////////////////////////////////////////////////////////////////////
+  if (enable0_int8) {
+    LOG(FATAL) << "int8 conv + conv fusion is not supported";
+    return;
+  } else {
+    // compute new conv_weight
+    Tensor weight_tensor;
+    auto in_dims = weight0_t->dims();
+    auto weight_dims = weight1_t->dims();
+    const float* din = weight0_t->data<float>();
+    const float* weights = weight1_t->data<float>();
+    int oc0 = in_dims[0];
+    int ic = in_dims[1];
+    int ih = in_dims[2];
+    int iw = in_dims[3];
+    int oc = weight_dims[0];
+    weight_tensor.Resize({oc, ic, ih, iw});
+    float* dout = weight_tensor.mutable_data<float>();
+    ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc);
+    weight0_t->CopyDataFrom(weight_tensor);
+  }
+  // compute new conv_bias
+  if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") &&
+      conv_op_desc->Input("Bias").size() > 0) {
+    auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name)
+                       ->GetMutable<lite::Tensor>();
+    if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
+        conv_op_desc1->Input("Bias").size() > 0) {
+      auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name)
+                         ->GetMutable<lite::Tensor>();
+      Tensor bias;
+      bias.CopyDataFrom(*bias_t1);
+      auto bias_data = bias.mutable_data<float>();
+      ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1);
+      bias_t1->CopyDataFrom(bias);
+      conv_op_desc->SetInput(
+          "Bias", {matched.at("conv_bias1")->arg()->name});  // conv_bias
+      IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
+    } else {
+      Tensor bias;
+      auto weight_dims = weight1_t->dims();
+      bias.Resize({weight_dims[0]});
+      auto bias_d = bias.mutable_data<float>();
+      ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr);
+      bias_t0->CopyDataFrom(bias);
+      conv_op_desc->SetInput(
+          "Bias", {matched.at("conv_bias0")->arg()->name});  // conv_bias
+    }
+  } else {
+    if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
+        conv_op_desc1->Input("Bias").size() > 0) {
+      conv_op_desc->SetInput(
+          "Bias", {matched.at("conv_bias1")->arg()->name});  // conv_bias
+      IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
+    }
+  }
+  conv_op_desc->SetType(conv_type0_);
+  conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name});
+  conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name});
+  conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name});
+
+  auto update_conv_desc = *conv_instruct->mutable_op_info();
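+  // ResetOp applies the updated desc to conv2d0, which now consumes the
+  // folded filter and writes directly to conv2d1's output.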
conv_instruct->ResetOp(update_conv_desc, graph->valid_places()); + + IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1")); +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/conv_conv_fuser.h b/lite/core/mir/fusion/conv_conv_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1f58d1c8746a137e2078006016ec6007c2afbb --- /dev/null +++ b/lite/core/mir/fusion/conv_conv_fuser.h @@ -0,0 +1,120 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class ConvConvFuser : public FuseBase { + public: + explicit ConvConvFuser(const std::string& conv_type0, + const std::string& conv_type1, + const bool conv_has_bias0, + const bool conv_has_bias1) + : conv_type0_(conv_type0), + conv_type1_(conv_type1), + conv_has_bias0_(conv_has_bias0), + conv_has_bias1_(conv_has_bias1) {} + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + void ComputeNewWeight(float* dout, + const float* din, + const float* weights, + int oc0, + int ic, + int ih, + int iw, + int oc1) { + // input conv_weight0_t weights conv_weight1_t + // output weight_tensor + // ksize = 1 + int in_size = ih * iw; + int in_channel_size = ic * in_size; + // out = w1[j, i, ih, iw] * w2[k, j, kw, kh] + // out_dim = [oc1, ic, kh, kw], din_dim = [oc0, ic, kh, kw] + // weight_dim = [oc1, oc0, kh, kw] + for (int k = 0; k < oc1; k++) { + const float* weights_ptr = weights + k * oc0; + float* out_ptr = dout + k * in_channel_size; + for (int c = 0; c < ic; c++) { + float* out_ptr_channel = out_ptr + c * in_size; + const float* din_ptr = din + c * in_size; + for (int i = 0; i < in_size; i++) { + float sum = 0.f; + for (int j = 0; j < oc0; j++) { + sum += din_ptr[j * in_channel_size] * weights_ptr[j]; + } + *out_ptr_channel++ = sum; + } + } + } + } + + void ComputeNewBias(float* dout, + Tensor* bias0_tensor, + Tensor* weight_tensor, + Tensor* bias1_tensor) { + // input bias0_tensor weight_tensor bias1_tensor + // output bias_tensor + auto in_dims = bias0_tensor->dims(); + auto weight_dims = weight_tensor->dims(); + const float* din = bias0_tensor->data(); + const float* weights = weight_tensor->data(); + int ic = in_dims[0]; + int oc = weight_dims[0]; + // out_k = b0[num, j, 1, 1] * w2[k, j, 1, 1] + if (bias1_tensor) { + const float* din2 = bias1_tensor->data(); + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + sum += din[j] * weights_ptr[j]; + } + dout[k] = sum + din2[k]; + } + } else { + for (int k = 0; k < oc; k++) { + const float* weights_ptr = weights + k * ic; + float sum = 0.f; + for (int j = 0; j < ic; j++) { + 
sum += din[j] * weights_ptr[j]; + } + dout[k] = sum; + } + } + } + + private: + std::string conv_type0_{"conv2d"}; + std::string conv_type1_{"conv2d"}; + bool conv_has_bias0_{false}; + bool conv_has_bias1_{false}; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index ea8400b0bb2cd1680e52d9a92ef79aca4e09887b..da42d6d0c79a2a7975eacca7095fedababac6d89 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -34,20 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // fuse quantized node and dequant node - for (auto& op_type : - {"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) { + std::vector quantized_op_types = { + "conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"}; + for (auto& op_type : quantized_op_types) { fusion::DequantOpFuser fuser(op_type); fuser(graph.get()); } - - for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { + for (auto& op_type : quantized_op_types) { fusion::ChannelWiseDequantOpFuser fuser(op_type); fuser(graph.get()); } // process quant_dequant_node - fusion::DeleteQuantDequantOpFuser dqd_fuser; - dqd_fuser(graph.get()); + std::vector quant_dequant_op_types = { + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_moving_average_abs_max"}; + for (auto& op_type : quant_dequant_op_types) { + fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type); + dqd_fuser(graph.get()); + } } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 1335518b00db5311b4605148817faed52164fd7a..758a85c84064fa8d1953a6531300208d13525634 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -315,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DeleteQuantDequantOpFuser::BuildPattern() { - std::string quant_dequant_op_type = - "fake_quantize_dequantize_moving_average_abs_max"; - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = - VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); + auto* input_act_node = VarNode("input_act_node") + ->assert_is_op_input(quant_dequant_op_type_, "X"); + auto* quant_dequant_node = + OpNode("quant_dequant_node", quant_dequant_op_type_) + ->assert_is_op(quant_dequant_op_type_); auto* output_scale_node = VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + ->assert_is_op_output(quant_dequant_op_type_, "OutScale"); auto* output_act_node = VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + ->assert_is_op_output(quant_dequant_op_type_, "Out"); + + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type_, "InScale"); + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + } else { + quant_dequant_node->LinksFrom({input_act_node}); + } output_scale_node->LinksFrom({quant_dequant_node}); 
output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto* input_scale_node = matched.at("input_scale_node"); auto* input_act_node = matched.at("input_act_node"); auto* quant_dequant_node = matched.at("quant_dequant_node"); auto* output_scale_node = matched.at("output_scale_node"); @@ -368,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, } // delete nodes and edges std::set nodes2rm = { - input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + quant_dequant_node, output_scale_node, output_act_node}; + if (quant_dequant_op_type_ == + "fake_quantize_dequantize_moving_average_abs_max") { + auto* input_scale_node = matched.at("input_scale_node"); + nodes2rm.insert(input_scale_node); + } GraphSafeRemoveNodes(graph, nodes2rm); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index ac3ac112b3aa504bc075125f2f13292073ca9444..c2dd1e5191cf0ad9b242dfa230abe3d38bad0cf7 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -86,17 +86,22 @@ class ChannelWiseDequantOpFuser : public FuseBase { std::string quantized_op_type_{}; }; -/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * quantized_op" can be deteted by this fuser. The fuser modifies the input - * scale for the quantized_op and deletes the fake_quant_dequant_op. +/* The pattern like "fake_quantize_dequantize_op + quantized_op" can be + * deteted by this fuser. The fuser modifies the input scale for the + * quantized_op and deletes the fake_quant_dequant_op. */ class DeleteQuantDequantOpFuser : public FuseBase { public: + explicit DeleteQuantDequantOpFuser(const std::string& quant_dequant_op_type) + : quant_dequant_op_type_(quant_dequant_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + + private: + std::string quant_dequant_op_type_{}; }; } // namespace fusion diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index d7486c0933dbbe74115bd6358962817b2b946c12..3c9bac1c5b9fbf6d48683f6423a4c670b17cb127 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { nodes_in_order = graph->StmtTopologicalOrder(); } + insts_.emplace_back(); for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); @@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { .SetSyncStreams(stmt.sync_streams_); } #endif - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); + insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } } diff --git a/lite/core/mir/generate_program_pass.h b/lite/core/mir/generate_program_pass.h index b126b4aba4d09a95a0033b04ed241812c88a3287..2ef4d035710d9542b365789aeabe8a08537ff225 100644 --- a/lite/core/mir/generate_program_pass.h +++ b/lite/core/mir/generate_program_pass.h @@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass { } private: - std::vector insts_; + std::vector> insts_; }; } // namespace mir diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 
92804d6e72e7a2de6f3a6f3b47f338aecd25aa8c..eddbebb545351fa6b1820682af487bb7b04e8bb3 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) TARGET(kBM), TARGET(kRKNPU), TARGET(kAPU), - TARGET(kMLU)}); + TARGET(kMLU), + TARGET(kHuaweiAscendNPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index 46738dd49c16fd9736d61711b4baf56d51247699..e09220d083ee8241001b6d9d55fb48eb1ba74f2e 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -284,13 +284,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); - UpdateInputTo( - sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -444,21 +450,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name, cur_node->AsArg().name); // for subgraph op, modify the BlockDesc - auto* sub_block_desc = dynamic_cast( - inst_node->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { - auto* sub_block_op_desc = sub_block_desc->GetOp(i); + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto* sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); UpdateOutputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); /* graph like this * subgraph_op_0 * / \ * / \ * subgraph_op_1 host_op */ - UpdateInputTo( - sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); } // recreate the op @@ -482,15 +494,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { } } -bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { - auto* block_desc = - static_cast(inst->AsStmt().op().get()) - ->GetSubBlock(); - for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { - auto op_desc = block_desc->GetOp(op_idx); - CHECK(op_desc); - if (op_desc->Type() == "conv2d") { - for (auto& names : op_desc->inputs()) { +bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, + Node* inst_node) { + auto sub_program_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetProgramDesc(); + CHECK(sub_program_desc); 
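+  // The subgraph op keeps its body in a block of its program desc; the
+  // 'sub_block' attribute tells which block to scan for the first conv2d.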
+ int sub_block_idx = + inst_node->AsStmt().op()->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + CHECK(sub_op_desc); + if (sub_op_desc->Type() == "conv2d") { + for (auto& names : sub_op_desc->inputs()) { if (std::find(names.second.begin(), names.second.end(), arg_node->AsArg().name) != names.second.end()) { @@ -746,19 +765,23 @@ std::pair CheckOutputAndInsert( // insert cast op on mlu, to avoid cast on cpu void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, const Type* subgraph_type) { - auto subgraph_op = subgraph_node->AsStmt().op(); - CHECK_EQ(subgraph_op->Type(), "subgraph"); - auto op = dynamic_cast(subgraph_op.get()); - CHECK(op); - auto block_desc = op->GetSubBlock(); + CHECK_EQ(subgraph_node->AsStmt().op()->Type(), "subgraph"); + auto subgraph_op = + dynamic_cast(subgraph_node->AsStmt().op().get()); + CHECK(subgraph_op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); + auto* sub_block_desc = const_cast( + sub_program_desc->GetBlock(sub_block_idx)); // create a new block desc to keep op sequence correct - cpp::BlockDesc* new_block_desc = new cpp::BlockDesc(); - new_block_desc->ClearOps(); - new_block_desc->ClearVars(); - new_block_desc->SetIdx(block_desc->Idx()); - new_block_desc->SetParentIdx(block_desc->ParentIdx()); - new_block_desc->SetForwardBlockIdx(block_desc->ForwardBlockIdx()); + cpp::BlockDesc new_block_desc; + new_block_desc.ClearOps(); + new_block_desc.ClearVars(); + new_block_desc.SetIdx(sub_block_desc->Idx()); + new_block_desc.SetParentIdx(sub_block_desc->ParentIdx()); + new_block_desc.SetForwardBlockIdx(sub_block_desc->ForwardBlockIdx()); // find all IO that is not weight or persist std::list i_names, o_names; @@ -769,8 +792,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, auto input_name = input->AsArg().name; if (!(input->AsArg().is_weight || input->AsArg().is_persist)) { i_names.emplace_back(input_name); - auto ret = CheckInputAndInsert(op->scope(), - new_block_desc, + auto ret = CheckInputAndInsert(subgraph_op->scope(), + &new_block_desc, input_name, input->AsArg().type, subgraph_type); @@ -783,8 +806,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, auto output_name = output->AsArg().name; if (!(output->AsArg().is_weight || output->AsArg().is_persist)) { o_names.emplace_back(output_name); - auto ret = CheckOutputAndInsert(op->scope(), - block_desc, + auto ret = CheckOutputAndInsert(subgraph_op->scope(), + sub_block_desc, output_name, output->AsArg().type, subgraph_type); @@ -795,46 +818,48 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, } // update input and output - for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); ++op_idx) { - auto desc = block_desc->GetOp(op_idx); - auto new_desc = new_block_desc->AddOp(); - *new_desc = *desc; - - if (desc->Type() != "layout" && desc->Type() != "cast") { - auto op_input_args = new_desc->InputArgumentNames(); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + ++sub_op_idx) { + auto sub_op_desc = sub_block_desc->GetOp(sub_op_idx); + auto new_op_desc = new_block_desc.AddOp(); + *new_op_desc = *sub_op_desc; + + if (sub_op_desc->Type() != "layout" && sub_op_desc->Type() != "cast") { + auto op_input_args = new_op_desc->InputArgumentNames(); for 
(auto& input_arg : op_input_args) { - auto op_input = new_desc->Input(input_arg); + auto op_input = new_op_desc->Input(input_arg); for (auto& it : i_names) { auto index = std::find(op_input.begin(), op_input.end(), it); if (index != op_input.end() && node_replace.find(it) != node_replace.end()) { index = op_input.erase(index); op_input.emplace(index, node_replace.at(it)); - VLOG(4) << new_desc->Type() << "] change input from " << it + VLOG(4) << new_op_desc->Type() << "] change input from " << it << " to " << node_replace.at(it); } } - new_desc->SetInput(input_arg, op_input); + new_op_desc->SetInput(input_arg, op_input); } - auto op_output_args = new_desc->OutputArgumentNames(); + auto op_output_args = new_op_desc->OutputArgumentNames(); for (auto& output_arg : op_output_args) { - auto op_output = new_desc->Output(output_arg); + auto op_output = new_op_desc->Output(output_arg); for (auto& it : o_names) { auto index = std::find(op_output.begin(), op_output.end(), it); if (index != op_output.end() && node_replace.find(it) != node_replace.end()) { index = op_output.erase(index); op_output.emplace(index, node_replace.at(it)); - VLOG(4) << new_desc->Type() << "] change output from " << it + VLOG(4) << new_op_desc->Type() << "] change output from " << it << " to " << node_replace.at(it); } } - new_desc->SetOutput(output_arg, op_output); + new_op_desc->SetOutput(output_arg, op_output); } } } - op->SetSubBlock(new_block_desc); + + *sub_block_desc = new_block_desc; } void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) { diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index f8991a359b177799cc5f59651c5d305fe64231ef..9cf7bc8995766e47895ce3dd2ef6bf7bcb614e5c 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -153,60 +153,61 @@ Node *SSAGraph::GraphCreateInstructNode( } void SSAGraph::Build(const Program &program, - const std::vector &valid_places) { + const std::vector &valid_places, + int block_idx) { CHECK(node_storage_.empty()); - auto weights_name = program.weights(); - auto is_weights = [&](const std::string &name) -> bool { - auto it = std::find(weights_name.begin(), weights_name.end(), name); - if (it == weights_name.end()) return false; + auto weights = program.weights(); + auto is_weight = [&](const std::string &name) -> bool { + auto it = std::find(weights.begin(), weights.end(), name); + if (it == weights.end()) return false; return true; }; - std::map var_types = program.var_data_type(); - - std::map arg_update_node_map_; - for (auto &op : program.ops()) { + auto var_type_map = program.var_type_map(); + std::map arg_update_node_map; + for (auto &op : program.ops(block_idx)) { VLOG(3) << op->op_info()->Type(); auto *op_node = GraphCreateInstructNode(op, valid_places); - for (const std::string &name : op->op_info()->input_names()) { + auto *op_info = op->op_info(); + const auto &op_type = op_info->Type(); + for (const auto &var_name : op_info->input_names()) { mir::Node *arg_node = nullptr; - if (arg_update_node_map_.count(name)) { - arg_node = arg_update_node_map_.at(name); + if (arg_update_node_map.count(var_name)) { + arg_node = arg_update_node_map.at(var_name); } else { node_storage_.emplace_back(); arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; } - if (var_types.count(name)) { + if (var_type_map.count(var_name)) { if (!arg_node->arg()->type) { - 
arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->arg()->type = var_type_map[var_name]; } // Store the original data type of the output tensors for // type_precision_cast_pass, to keep the consistency between the // output types of original graph and optimized graph's - if (op->op_info()->Type() == "fetch") { + if (op_type == "fetch") { op->mutable_op_info()->SetAttr( - "data_type", static_cast(var_types[name])); + "data_type", + static_cast(var_type_map[var_name]->precision())); } } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); } - for (const std::string &name : op->op_info()->output_names()) { + for (const auto &var_name : op->op_info()->output_names()) { node_storage_.emplace_back(); auto *arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + arg_node->AsArg(var_name, node_storage_.size() - 1); + arg_update_node_map[var_name] = arg_node; + if (var_type_map.count(var_name) && !arg_node->arg()->type) { + arg_node->arg()->type = var_type_map[var_name]; } - if (is_weights(name)) arg_node->AsArg().is_weight = true; + if (is_weight(var_name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(op_node, arg_node); } diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h index e2967cf96a6b00ccc225ce05b043cb94f161b1d6..819b0a71ea1be04c85316e90001aef311b7d7238 100644 --- a/lite/core/mir/ssa_graph.h +++ b/lite/core/mir/ssa_graph.h @@ -35,9 +35,13 @@ class GraphBase {}; class SSAGraph : GraphBase { public: - // @param program: the op program + // @param program: the target program with vars and ops // @param valid_places: the valid places user set for the system. 
- void Build(const Program &program, const std::vector &valid_places); + // @param block_idx: the block index in the target program, default is 0(main + // block) + void Build(const Program &program, + const std::vector &valid_places, + int block_idx = kRootBlockIdx); void RemoveNode(const mir::Node *node); std::vector StmtTopologicalOrder(); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 4b9f34225f70e9050b2605b49e888ed323536b2f..13805b2b18634551d4b74ac436954fa8f6b9ed05 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -411,16 +411,17 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops and Vars of the target - // subgraph and sub_block_idx is set as a attribute of subgraph op, - // sub_block_idx < 0 means it's a new subgraph op - int sub_block_idx = -(subgraph_idx + 1); - auto sub_block_desc = new cpp::BlockDesc(); + // Create a program desc and a block desc for storing all of Ops and Vars of + // the target subgraph and sub_block_idx is set as a attribute of subgraph op, + // sub_block_idx = 0 means it's a new subgraph op + auto sub_program_desc = std::make_shared(); + int sub_block_idx = 0; + auto sub_block_desc = sub_program_desc->AddBlock(); sub_block_desc->ClearOps(); sub_block_desc->ClearVars(); for (auto &op_node : subgraph_nodes) { - auto sub_block_op_desc = sub_block_desc->AddOp(); - *sub_block_op_desc = *op_node->AsStmt().op_info(); + auto sub_op_desc = sub_block_desc->AddOp(); + *sub_op_desc = *op_node->AsStmt().op_info(); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); @@ -437,13 +438,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, &local_var_nodes, &unused_var_nodes); // A simplified model without the original weight/local/unused nodes on the - // subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to - // true and Predictor->Run(...), Predictor->Save(...) is called. + // subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to + // true(default) and Predictor->Run(...), Predictor->Save(...) is called. 
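Condensed, and with explicit template arguments, the wiring this hunk sets up for a fused subgraph op is roughly the following (a sketch, not a verbatim copy of the patch; subgraph_nodes and subgraph_op_desc are the locals of SubgraphFuser::InsertNewNode):

auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
auto* sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
for (auto& op_node : subgraph_nodes) {
  auto* sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
  *sub_op_desc = *op_node->AsStmt().op_info();
}
// sub_block = 0 marks a not-yet-registered subgraph block; SaveToProgram
// appends the block to the main program desc later and rewrites this attr.
subgraph_op_desc.SetAttr<int32_t>("sub_block", 0);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp*>(subgraph_op.get())
    ->SetProgramDesc(sub_program_desc);

The online-mode handling described in the comment above then decides which extra var nodes (weights, locals, unused outputs) are additionally attached as inputs and outputs of this op.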
std::set input_var_nodes(idata_var_nodes.begin(), idata_var_nodes.end()); std::set output_var_nodes(odata_var_nodes.begin(), odata_var_nodes.end()); - if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) { + if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) { input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end()); output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end()); output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end()); @@ -476,7 +477,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetOutput("Outputs", output_var_names); auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); static_cast(subgraph_op.get()) - ->SetSubBlock(sub_block_desc); + ->SetProgramDesc(sub_program_desc); auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); subgraph_op->Attach(subgraph_op_desc, any_op->scope()); diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 06c9c4c78fedba7cfabcd4ff2dd3804b404f966d..f7e354f7a22582991ca64fa2d5fcc147bf6ed427 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -141,12 +141,11 @@ std::vector AddFetchDesc( } TEST(Subgraph, detect_simple_model) { - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); std::vector valid_places{{TARGET(kHost), PRECISION(kFloat)}}; auto scope = std::make_shared(); // Build a simple network - program_desc.ClearBlocks(); - auto* block_desc = program_desc.AddBlock(); + auto* block_desc = program_desc->AddBlock(); block_desc->ClearOps(); block_desc->ClearVars(); auto* var_desc = block_desc->AddVar(); @@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) { "the path of model files."; return; } - cpp::ProgramDesc program_desc; + auto program_desc = std::make_shared(); auto scope = std::make_shared(); LoadModelPb(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, scope.get(), - &program_desc, + program_desc.get(), !FLAGS_model_file.empty() && !FLAGS_params_file.empty(), false); std::vector valid_places({ @@ -200,6 +199,9 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}, +#endif #ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index f4df5c5f454c08c5f79dd220e579632dc7cf05a5..429c780912094baf9ceb8b5124dc197abd51af41 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void HuaweiAscendNPUSubgraphPass::Apply( + const std::unique_ptr& graph) { + std::set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void APUSubgraphPass::Apply(const std::unique_ptr& graph) { std::set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) \ @@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { 
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass, + paddle::lite::mir::HuaweiAscendNPUSubgraphPass) + .BindTargets({TARGET(kHuaweiAscendNPU)}); REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) .BindTargets({TARGET(kAPU)}); REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index 8c2b501a62356c91e93f3c4ca91f70879d3c9229..c40a527cfe72ab1556e868d05aab5c0280fa4514 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class HuaweiAscendNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class APUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 104ad5b4fa819de5ff3501c08c60e9918c93cddf..5a57623b0c984be24e2d0b97ee575b22d369fdad 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -187,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif +#ifdef LITE_WITH_HUAWEI_ASCEND_NPU + valid_places.push_back( + lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)}); +#endif #ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 39a94cbca6bd6222da5da1d314ea07475592bf0e..40ece35993cfd2f8bce07e605387741202973614 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op, op_desc->GetAttr>("input_data_names"); std::replace(input_data_names.begin(), input_data_names.end(), from, to); op_desc->SetAttr("input_data_names", input_data_names); - auto* subblock_desc = static_cast(op)->GetSubBlock(); - CHECK(subblock_desc); - for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { - auto* subblock_op_desc = subblock_desc->GetOp(i); - for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { - for (auto& subblock_var_name : subblock_op_input.second) { - if (subblock_var_name == from) { - subblock_var_name = to; + auto sub_program_desc = + static_cast(op)->GetProgramDesc(); + CHECK(sub_program_desc); + int sub_block_idx = op_desc->GetAttr("sub_block"); + auto sub_block_desc = + sub_program_desc->GetBlock(sub_block_idx); + for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize(); + sub_op_idx++) { + auto sub_op_desc = const_cast( + sub_block_desc->GetOp(sub_op_idx)); + for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) { + for (auto& sub_var_name : sub_op_input.second) { + if (sub_var_name == from) { + sub_var_name = to; } } } diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index d9f420cfad90d3c6a1f08072d8c5f87d2326661a..f7d35bfef3ac53903448c48300c144f8fd15652d 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -59,25 +59,46 @@ class 
VariablePlaceInferencePass : public DebugPass { } // Set the type of the weight - void SetWeightType(Node* w, + void SetWeightType(Node* weight_node, const LiteType& type, - const std::map& lite_with_targets) { + const std::map& with_targets) { VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); - if (lite_with_targets.at("kFPGA")) { - w->AsArg().type = LiteType::GetTensorTy( + if (with_targets.at("kFPGA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kOpenCL")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kOpenCL")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - } else if (lite_with_targets.at("kCUDA")) { - w->AsArg().type = LiteType::GetTensorTy( + } else if (with_targets.at("kCUDA")) { + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); } else { - w->AsArg().type = LiteType::GetTensorTy( + weight_node->AsArg().type = LiteType::GetTensorTy( TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); } } + // Update a's kUnk fields from b's fields. + void UpdateTypeFrom(const Type** a, const Type* b) { + auto target = (*a)->target(); + auto precision = (*a)->precision(); + auto layout = (*a)->layout(); + if (target == TARGET(kUnk)) { + target = b->target(); + } + if (precision == PRECISION(kUnk)) { + precision = b->precision(); + } + if (layout == DATALAYOUT(kUnk)) { + layout = b->layout(); + } + if ((*a)->IsTensor() && b->IsTensor()) { + *a = LiteType::GetTensorTy(target, precision, layout); + } else if ((*a)->IsTensorList() && b->IsTensorList()) { + *a = LiteType::GetTensorListTy(target, precision, layout); + } + } + void InferenceArgumentPlace(SSAGraph* graph) { auto& valid_places = graph->valid_places(); auto valid_places_has_target = [&](TargetType t) -> bool { @@ -88,122 +109,90 @@ class VariablePlaceInferencePass : public DebugPass { } return false; }; - std::map lite_with_targets{ + std::map with_targets{ {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, {"kCUDA", valid_places_has_target(TARGET(kCUDA))}, {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; - VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; - VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"]; + VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"]; VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); - for (auto& x : graph->StmtTopologicalOrder()) { - auto& inst = x->AsStmt(); + for (auto& node : graph->StmtTopologicalOrder()) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + const auto& op_type = op_info->Type(); + auto& kernel = inst.picked_kernel(); + // The IoCopyOp is a tool operator, it won't support the type inference. 
// in fpga, we has io_copy+cali+layout tool ops, so we need type inference - // for - // tool operator - if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; + // for tool operator + if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) { + VLOG(3) << "skip 'io_copy' if target is FPGA and OpenCL"; + if (op_type == "io_copy") continue; } - // deal with inputs - VLOG(4) << "Infering op " << inst.op_info()->Repr(); - // TODO(zhaolong): Add check if the node's name in op's arguments. - auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; - - for (auto* x_in : x->inlinks) { - std::string node_name = x_in->AsArg().name; - std::string arg_name = get_argname(node_name, inst.op_info()->inputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name; - VLOG(4) << "-- input arg_name:" << arg_name << " " - << "-- node name:" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); - if (!x_in->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; - if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type, lite_with_targets); + // Infering the input and output variable's place according to the + // declaration of I/O arguments of the picked kernel of the op + VLOG(4) << "Op " << op_info->Repr(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument for var " << var_name; + VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name; + const auto* decl_type = kernel.GetInputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(in_node, *decl_type, with_targets); } else { - x_in->AsArg().type = type; + *var_type = decl_type; } - } else if (x_in->AsArg().type->target() == TARGET(kUnk) && - x_in->AsArg().type->precision() != PRECISION(kUnk) && - x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. 
- if (type->precision() == PRECISION(kInt8)) { - x_in->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8)) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_in->AsArg().type->precision(); - x_in->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } - - VLOG(4) << "inst " << inst.op_info()->Repr(); - for (auto* x_out : x->outlinks) { - std::string node_name = x_out->AsArg().name; - std::string arg_name = - get_argname(node_name, inst.op_info()->outputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name << " in Inst " - << inst.op_type(); - VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); - if (!x_out->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; - if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type, lite_with_targets); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + auto* var_type = &var.type; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument for var " << var_name; + VLOG(4) << " - output arg name:" << arg_name + << " var name:" << var_name; + const auto* decl_type = kernel.GetOutputDeclType(arg_name); + if (!(*var_type)) { + VLOG(4) << "set type " << *decl_type << " " << var_name; + if (var.is_weight) { + SetWeightType(out_node, *decl_type, with_targets); } else { - x_out->AsArg().type = type; + *var_type = decl_type; } - } else if (x_out->AsArg().type->target() == TARGET(kUnk) && - x_out->AsArg().type->precision() != PRECISION(kUnk) && - x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) { + } else if (!(*var_type)->place().is_valid()) { // If is quantization, infer the Int8 type. - if (type->precision() == PRECISION(kInt8)) { - x_out->AsArg().type = type; - } else if (type->precision() == PRECISION(kFP16) && - type->target() != TARGET(kOpenCL)) { - x_out->AsArg().type = type; + if (decl_type->precision() == PRECISION(kInt8) || + (decl_type->precision() == PRECISION(kFP16) && + decl_type->target() != TARGET(kOpenCL))) { + *var_type = decl_type; } else { - PrecisionType tmp_ptype = x_out->AsArg().type->precision(); - x_out->AsArg().type = LiteType::GetTensorTy( - type->target(), tmp_ptype, type->layout()); + UpdateTypeFrom(var_type, decl_type); } } } } } - // Update me's kUnk fields by other's fields. - void UpdatePlace(Place* me, const Place& other) { - CHECK(other.is_valid()); - if (me->target == TARGET(kUnk)) { - me->target = other.target; - } - if (me->precision == PRECISION(kUnk)) { - me->precision = other.precision; - } - if (me->layout == DATALAYOUT(kUnk)) { - me->layout = other.layout; - } - } - private: - // The default target for arguments, e.g. load weights to CPU memory for CUDA - // computation by default. + // The default target for arguments, e.g. load weights to CPU memory for + // CUDA computation by default. 
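For clarity, the effect of the UpdateTypeFrom helper introduced above: only the fields that are still kUnk on the variable's type are taken from the kernel's declared type, so a precision that is already known is preserved. A small illustrative case (shown as if the helper were free-standing; the concrete values are made up):

// Variable type known only by precision; kernel declares a full tensor type.
const Type* var_type =
    LiteType::GetTensorTy(TARGET(kUnk), PRECISION(kFloat), DATALAYOUT(kUnk));
const Type* decl_type =
    LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW));
UpdateTypeFrom(&var_type, decl_type);
// var_type is now (kARM, kFloat, kNCHW): target and layout are filled in
// from the declaration, while the known kFloat precision is kept.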
TargetType argument_default_target_{TARGET(kHost)}; }; diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 079586d5e0c00f261bfbf4c7658ccca97402f8ac..d94753220a1b5d963092c62c43d7e49b03243c63 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -99,7 +99,7 @@ class OpLite : public Registry { std::vector> CreateKernels( const std::vector &places, const std::string &kernel_type = ""); - lite::Scope *scope() { return scope_; } + Scope *scope() { return scope_; } // Assign op param to kernel. virtual void AttachKernel(KernelBase *kernel) = 0; @@ -169,7 +169,7 @@ class OpLite : public Registry { } protected: - lite::Scope *scope_{nullptr}; + Scope *scope_{nullptr}; std::unique_ptr kernel_; std::string op_type_; std::vector valid_places_; diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 70905c96f08d74fc5e27c85c7ccf3d395420a5e9..42dac8e59bda84ce5dc2cb04f2f3712d1386b96c 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -19,6 +19,7 @@ #include #include #include +#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h" #include "lite/core/mir/generate_program_pass.h" #include "lite/core/mir/pass_manager.h" #include "lite/core/mir/pass_utils.h" @@ -36,6 +37,9 @@ namespace lite { * lite::Optimizer optimize a program. It utilize the mir passes to analysis the * program and export an optimized program. */ +// TODO(hong1986032) Support the following passes for the subblocks +const std::set kSubblockUnsupportedPasses( + {"memory_optimize_pass"}); class Optimizer { public: Optimizer() {} @@ -60,14 +64,20 @@ class Optimizer { program_ = &program; valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; - CHECK(!graph_) << "duplicate optimize found"; - - graph_.reset(new mir::SSAGraph); - graph_->Build(program, valid_places); - graph_->SetValidPlaces(valid_places); + CHECK(graphs_.empty()) << "duplicate optimize found"; + + auto block_size = program.block_size(); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + std::unique_ptr graph; + graph.reset(new mir::SSAGraph); + graph->Build(program, valid_places, block_idx); + graph->SetValidPlaces(valid_places); + graphs_.emplace_back(std::move(graph)); + } SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); if (passes.empty() || passes.size() == 1) { std::vector passes_local{ @@ -76,6 +86,7 @@ class Optimizer { "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise + "lite_conv_conv_fuse_pass", // // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -106,11 +117,13 @@ class Optimizer { // 'enable_int8' for all // of the quantized ops. "npu_subgraph_pass", + "huawei_ascend_npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", "apu_subgraph_pass", "rknpu_subgraph_pass", "mlu_subgraph_pass", + "control_flow_op_unused_inputs_and_outputs_eliminate_pass", "static_kernel_pick_pass", // pick original kernel from graph "remove_tf_redundant_ops_pass", @@ -175,62 +188,15 @@ class Optimizer { exec_scope_ = program.exec_scope(); } - const lite::Scope* exec_scope() const { return exec_scope_; } - - // Set shape(dims) infos of var descs to scope var. - // developer can write pass using input / output tensor dims of op. 
- // - // Example: If you have node `Node* softmax_node`, - // you can get dims of output tensor in passes: - // - // auto* scope = softmax_node->AsStmt().op()->scope(); - // auto softmax_out_arg_name = - // softmax_node->outlinks.front()->AsArg().name; - // auto softmax_out_tensor = - // scope->FindVar(softmax_out_arg_name)->Get(); - // softmax_out_dims = softmax_out_tensor.dims(); - void SetVarDescShapeToScopeVar() { - auto dims_to_str_func = [](std::vector shape) -> std::string { - std::string str_res; - for (size_t i = 0; i < shape.size(); ++i) { - str_res += std::to_string(shape[i]); - if (i != shape.size() - 1) { - str_res += "x"; - } - } - return str_res; - }; - - auto* program_desc = program_->program_desc(); - VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize(); - auto blocks_desc = program_desc->GetBlocks(); - for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) { - auto block_desc = blocks_desc[bidx]; - auto vars_desc = block_desc.GetVars(); - for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) { - auto var_desc = vars_desc[vidx]; - VLOG(5) << var_desc.Name() << " " - << dims_to_str_func(var_desc.GetShape()); - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - auto* var = program_->exec_scope()->FindVar(var_desc.Name()); - auto tensor = var->GetMutable(); - if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) { - VLOG(5) << "var_desc.Name():" << var_desc.Name() - << " shape:" << dims_to_str_func(var_desc.GetShape()); - tensor->Resize(var_desc.GetShape()); - } - VLOG(5) << "var_desc.Name():" << var_desc.Name() - << " shape:" << dims_to_str_func(var_desc.GetShape()) - << " tensor:" << tensor->dims(); - } - } - } + const Scope* exec_scope() const { return exec_scope_; } // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); - pass->Apply(graph_); + for (auto& graph : graphs_) { + pass->Apply(graph); + } auto program = pass->GenProgram(); CHECK(exec_scope_); program->set_exec_scope(exec_scope_); @@ -246,27 +212,38 @@ class Optimizer { pass->SetValidPlaces(valid_places_); } + void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() { + auto* pass = + mir::PassManager::Global() + .LookUp( + "control_flow_op_unused_inputs_and_outputs_eliminate_pass"); + CHECK(pass); + CHECK(!graphs_.empty()); + pass->SetAllGraphs(&graphs_); + } + // Generate C++ code which combines the inference program, model and weights. void GenCode(const std::string& code_dir); - const mir::SSAGraph& ssa_graph() const { - CHECK(graph_); - return *graph_; + const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return *graphs_[block_idx]; } - mir::SSAGraph* mutable_ssa_graph() { - CHECK(graph_); - return graph_.get(); + mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) { + CHECK(!graphs_.empty()); + CHECK(graphs_[block_idx]); + return graphs_[block_idx].get(); } - lite::Scope* exec_scope() { return exec_scope_; } + Scope* exec_scope() { return exec_scope_; } protected: void SpecifyKernelPickTactic(core::KernelPickFactor factor); // Specify the passes and run them. 
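RunPasses below applies every pass to the graph of each block; passes named in kSubblockUnsupportedPasses (currently only memory_optimize_pass) are restricted to the root block. Condensed, the dispatch in the code that follows is:

for (auto& name : passes) {
  mir::Pass* pass = mir::PassManager::Global().LookUp(name);
  if (kSubblockUnsupportedPasses.count(name)) {
    pass->Apply(graphs_[kRootBlockIdx]);             // root block only
  } else {
    for (auto& graph : graphs_) pass->Apply(graph);  // every block
  }
}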
void RunPasses(const std::vector& passes) { - SetVarDescShapeToScopeVar(); for (auto& x : passes) { LOG(INFO) << "== Running pass: " << x; mir::Pass* pass = mir::PassManager::Global().LookUp(x); @@ -284,16 +261,23 @@ class Optimizer { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; } else { - pass->Apply(graph_); + // Check the pass whether it is supported for processing subblocks + if (kSubblockUnsupportedPasses.count(x)) { + pass->Apply(graphs_[kRootBlockIdx]); + } else { + for (auto& graph : graphs_) { + pass->Apply(graph); + } + } LOG(INFO) << "== Finished running: " << x; } } } private: - std::unique_ptr graph_; + std::vector> graphs_; std::vector valid_places_; - lite::Scope* exec_scope_{}; + Scope* exec_scope_{}; Program* program_{}; }; diff --git a/lite/core/program.cc b/lite/core/program.cc index 2d0d4c8b66138e40d6986fcaa39e35e82322ece5..bd6dd09683b5004167ee1f8d6426fde0fff4f6b0 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -15,6 +15,7 @@ #include "lite/core/program.h" #include #include +#include #include "lite/model_parser/cpp_desc.h" #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" @@ -26,122 +27,221 @@ namespace paddle { namespace lite { -void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - // NOTE: RuntimeProgram do not has all meta info, so save model just update - // upon origin model - CHECK(desc->BlocksSize()); - auto main_block = desc->GetBlock(0); - main_block->ClearOps(); - for (auto& node : instructions_) { - auto op_type = node.op()->op_info()->Type(); - if (op_type == "subgraph") { - auto subgraph_op = const_cast( - static_cast(node.op())); - int sub_block_idx = subgraph_op->op_info()->GetAttr("sub_block"); - if (sub_block_idx < 0) { - // It's a new subgraph op when its sub_block_idx < 0, Now we add its +void RuntimeProgram::SaveToProgram( + std::shared_ptr program_desc) { + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK_GT(block_size, 0) << "No block found!"; + // TODD(hong19860320) Only support updating the block desc which already + // exists in the origin program desc + CHECK_LE(block_size, instructions_.size()) + << "Invalid block size, expected (0," << instructions_.size() + << "] but got " << block_size; + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto block_desc = program_desc->GetBlock(block_idx); + // Record all of the origin vars in the origin block + std::map origin_var_maps; + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto v = block_desc->GetVar(var_idx); + origin_var_maps.emplace(v->Name(), *v); + } + // Update the ops and vars for each block according to the instructions + block_desc->ClearVars(); + block_desc->ClearOps(); + std::set already_added_vars; + for (auto& inst : instructions_[block_idx]) { + auto* op = const_cast(inst.op()); + auto* op_info = op->op_info(); + auto op_type = op_info->Type(); + auto* kernel = inst.mutable_kernel(); + auto* scope = op->scope(); + // Update the origin vars which are referred by the instructions + // Add the new vars which are created in the passes and referred by the + // instructions + auto var_names = op_info->input_names(); + auto out_names = op_info->output_names(); + // Combine input and output vars and delete the duplicates + var_names.insert(var_names.end(), out_names.begin(), out_names.end()); + std::stable_sort(var_names.begin(), var_names.end()); + 
var_names.erase(std::unique(var_names.begin(), var_names.end()), + var_names.end()); + for (auto& var_name : var_names) { + if (already_added_vars.count(var_name)) continue; + auto* v = block_desc->AddVar(); + v->SetName(var_name); + auto it = origin_var_maps.find(var_name); + if (it != origin_var_maps.end()) { + v->SetType(it->second.GetType()); + v->SetPersistable(it->second.Persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(it->second.GetShape()); + v->SetDataType(it->second.GetDataType()); + } + } else { + std::string arg_name; + const Type* decl_type; + if (op_info->GetInputArgname(var_name, &arg_name)) { + decl_type = kernel->GetInputDeclType(arg_name); + } else { + op_info->GetOutputArgname(var_name, &arg_name); + decl_type = kernel->GetOutputDeclType(arg_name); + } + if (decl_type->IsTensor()) { + v->SetType(cpp::VarDesc::Type::LOD_TENSOR); + auto tensor = scope->FindVar(var_name)->GetMutable(); + v->SetPersistable(tensor->persistable()); + if (var_name != "feed" && var_name != "fetch") { + v->SetShape(tensor->dims().data()); + auto precision = tensor->precision(); + switch (precision) { +#define SET_DATATYPE(precision__, data_type) \ + case PrecisionType::precision__: \ + v->SetDataType(data_type); \ + LOG(INFO) << "Update var " << var_name << " done"; \ + break + SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); + SET_DATATYPE(kFloat, VarDescAPI::VarDataType::FP32); + SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); + SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); + SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); + SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); + SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); +#undef SET_DATATYPE + default: + LOG(WARNING) << "Unknown precision type " + << PrecisionToStr(precision) << " for var " + << var_name << " in op " << op_type; + } + } + } else if (decl_type->IsTensorList()) { + // Set persistable=false for tensor array + v->SetType(cpp::VarDesc::Type::LOD_TENSOR_ARRAY); + v->SetPersistable(false); + } else { + CHECK(false) << "Unsupported decl type " << *decl_type + << " for var " << var_name << " in op " << op_type; + } + } + already_added_vars.insert(var_name); + } + // Replace all of origin ops with the instructions + auto op_desc = block_desc->AddOp(); + *op_desc = *op_info; + op_desc->SetAttr(kKernelTypeAttr, kernel->SerializedKernelType()); + if (op_type == "subgraph" && !op_info->GetAttr("sub_block")) { + // It's a new subgraph op when its sub_block_idx = 0, Now we add its // subblock desc to the program desc, Then update its sub_block_idx to // the index of block desc of the program desc. 
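Previously a freshly fused subgraph op was marked with a negative sub_block index; now it is marked with sub_block = 0 and carries its own program desc, and the block is only registered in the main program desc at save time. The registration step, condensed from the code below (with explicit template arguments):

auto sub_program_desc = subgraph_op->GetProgramDesc();  // standalone desc
auto* sub_block_desc = program_desc->AddBlock<cpp::BlockDesc>();
*sub_block_desc = *sub_program_desc->GetBlock<cpp::BlockDesc>(0);
subgraph_op->SetProgramDesc(program_desc);  // now share the main program desc
op_desc->SetAttr<int32_t>(
    "sub_block", static_cast<int32_t>(program_desc->BlocksSize()) - 1);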
- sub_block_idx = desc->BlocksSize(); - auto sub_block_desc = subgraph_op->GetSubBlock(); - CHECK(sub_block_desc); - auto new_block_desc = desc->AddBlock(); - *new_block_desc = *sub_block_desc; - delete sub_block_desc; - subgraph_op->mutable_op_info()->SetAttr("sub_block", - sub_block_idx); - subgraph_op->SetSubBlock(new_block_desc); - // Update main block desc after a new subblock desc is added - main_block = desc->GetBlock(0); + auto subgraph_op = static_cast(op); + auto sub_program_desc = subgraph_op->GetProgramDesc(); + CHECK(sub_program_desc); + auto sub_block_desc = program_desc->AddBlock(); + *sub_block_desc = *sub_program_desc->GetBlock(0); + subgraph_op->SetProgramDesc(program_desc); + op_desc->SetAttr("sub_block", program_desc->BlocksSize() - 1); + // Attach op and kernel again to update the new block_idx and + // program_desc + subgraph_op->Attach(*op_desc, scope); + subgraph_op->AttachKernel(kernel); + // Update the pointer of block desc after a new subblock desc is added + block_desc = program_desc->GetBlock(block_idx); } } - auto op = main_block->AddOp(); - *op = *node.op()->op_info(); - op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); } } -// `UpdateVarsOfProgram` will remove unused var_descs and add new created -// vars' descs in the block 0. Now, the type of a new created var can only -// be LOD_TENSOR. -void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - CHECK(desc->BlocksSize()); - std::map origin_var_maps; - auto& main_block = *desc->GetBlock(0); - auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { - auto v = main_block.GetVar(i); - auto name = v->Name(); - origin_var_maps.emplace(name, *v); - } - - main_block.ClearVars(); - for (auto& node : instructions_) { - auto* op = const_cast(node.op()); - auto* kernel = node.kernel(); - auto* scope = op->scope(); - auto in_names = op->op_info()->input_names(); - auto out_names = op->op_info()->output_names(); - in_names.insert(in_names.end(), out_names.begin(), out_names.end()); - std::stable_sort(in_names.begin(), in_names.end()); - in_names.erase(std::unique(in_names.begin(), in_names.end()), - in_names.end()); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - if ((it->second).Name() != "feed" && (it->second).Name() != "fetch") { - v->SetShape((it->second).GetShape()); - v->SetDataType((it->second).GetDataType()); - } +// Create runtime program from sub_block desc according to block_idx and +// program_desc, which is used for while/conditional_block/subgraph op. 
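A typical call site for the constructor defined below, e.g. inside a while, conditional_block or subgraph kernel (an illustrative sketch; op_info, program_desc and exec_scope stand for whatever the kernel already holds and are not names taken from this patch):

// The op stores the index of its body block and shares the whole program
// desc, so the kernel can build a runnable sub-program on demand.
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
std::unique_ptr<RuntimeProgram> sub_program(
    new RuntimeProgram(program_desc, exec_scope, sub_block_idx));
sub_program->Run();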
+RuntimeProgram::RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx) + : exec_scope_(exec_scope) { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif + CHECK(program_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size) << "No block found!"; + CHECK(block_idx >= 0 && block_idx < block_size) + << "Invalid block index, expected [0," << (block_size - 1) << "] but got " + << block_idx; + auto block_desc = program_desc->GetBlock(block_idx); + instructions_.resize(kRootBlockIdx + 1); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; op_idx++) { + auto op_desc = block_desc->GetOp(op_idx); + CHECK(op_desc); + std::string op_type = op_desc->Type(); + // if (op_type == "feed" || op_type == "fetch") continue; + // Create op and pick up the best kernel + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; + if (op_type == "while") { + static_cast(op.get())->SetProgramDesc(program_desc); + } else if (op_type == "conditional_block") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } else if (op_type == "subgraph") { + static_cast(op.get())->SetProgramDesc( + program_desc); + } + op->Attach(*op_desc, exec_scope_); + std::unique_ptr kernel; + if (op_desc->HasAttr(kKernelTypeAttr)) { + // Create op and pick up the best kernel according to the + // kKernelTypeAttr attribute + auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); + std::string alias; + Place place; + KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); + VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type + << " for " << op_type; + auto kernels = op->CreateKernels({place}); + CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; + auto it = std::find_if( + kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { + return it->alias() == alias; + }); + CHECK(it != kernels.end()); + kernel = std::move(*it); + } else { + // TODO(hong19860320) add kernel picking according to the type of input + // and output tensors + VLOG(3) << "The attr '" << kKernelTypeAttr + << "' not found, pick the first kernel for " << op_type; + std::vector> kernels; +#if defined(LITE_WITH_ARM) + kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); +#elif defined(LITE_WITH_X86) + kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); +#endif + if (kernels.size() > 0) { + kernel = std::move(kernels.front()); } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(in_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string in_arg_name; - const Type* type; - if (op->op_info()->GetInputArgname(in_name, &in_arg_name)) { - type = kernel->GetInputDeclType(in_arg_name); - } else { - op->op_info()->GetOutputArgname(in_name, &in_arg_name); - type = kernel->GetOutputDeclType(in_arg_name); - } - if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - if (in_name != "feed" && in_name != "fetch") { - v->SetShape(tensor->dims().data()); - switch (tensor->precision()) { -#define SET_DATATYPE(precision__, data_type) \ - case PrecisionType::precision__: \ - v->SetDataType(data_type); \ - LOG(INFO) << "update var" << (it->second).Name() << "done"; \ - break - SET_DATATYPE(kBool, VarDescAPI::VarDataType::BOOL); - SET_DATATYPE(kFloat, 
VarDescAPI::VarDataType::FP32); - SET_DATATYPE(kFP16, VarDescAPI::VarDataType::FP16); - SET_DATATYPE(kInt8, VarDescAPI::VarDataType::INT8); - SET_DATATYPE(kInt16, VarDescAPI::VarDataType::INT16); - SET_DATATYPE(kInt32, VarDescAPI::VarDataType::INT32); - SET_DATATYPE(kInt64, VarDescAPI::VarDataType::INT64); -#undef SET_DATATYPE - default: - VLOG(4) << "warning! unknown precision type"; - } - } - } else { - CHECK(false) << "unsupported var type"; - } + LOG(WARNING) << "No kernels found for " << op_type; } } +#ifdef LITE_WITH_OPENCL + if (kernel->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + kernel->SetContext(std::move(ctx)); + } else { + kernel->SetContext( + ContextScheduler::Global().NewContext(kernel->target())); + } +#else + kernel->SetContext(ContextScheduler::Global().NewContext(kernel->target())); +#endif + instructions_[kRootBlockIdx].emplace_back(std::move(op), std::move(kernel)); } + Init(); } + void RuntimeProgram::Run() { #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); @@ -158,7 +258,8 @@ void RuntimeProgram::Run() { } #endif int idx = -1; - for (auto& inst : instructions_) { + auto& insts = instructions_[kRootBlockIdx]; + for (auto& inst : insts) { ++idx; #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -191,57 +292,50 @@ void RuntimeProgram::Run() { #endif } -void Program::Build(const cpp::ProgramDesc& prog) { +void Program::Build(const std::shared_ptr& program_desc) { CHECK(ops_.empty()) << "Executor duplicate Build found"; // Create operators. - auto& program = prog; - CHECK(program.BlocksSize()); - auto& main_block = *program.GetBlock(0); - for (size_t i = 0; i < main_block.OpsSize(); ++i) { - auto& op_desc = *main_block.GetOp(i); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - if (op_type == "while" || op_type == "conditional_block" || - op_type == "subgraph") { - auto sub_block_idx = op_desc.GetAttr("sub_block"); - CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize()) - << "Invalid attribute sub_block(" << sub_block_idx << ") for " - << op_type; - auto sub_block_desc = - const_cast(prog).GetBlock( - sub_block_idx); - CHECK(sub_block_desc); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + ops_.resize(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto op_size = block_desc->OpsSize(); + for (size_t op_idx = 0; op_idx < op_size; ++op_idx) { + auto* op_desc = block_desc->GetOp(op_idx); + auto op_type = op_desc->Type(); + VLOG(4) << "create Op [" << op_type << "]"; + auto op = LiteOpRegistry::Global().Create(op_type); + CHECK(op) << "no Op found for " << op_type; if (op_type == "while") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "conditional_block") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } else if (op_type == "subgraph") { - static_cast(op.get())->SetSubBlock( - sub_block_desc); + static_cast(op.get())->SetProgramDesc( + program_desc); } + op->Attach(*op_desc, exec_scope_); + ops_[block_idx].emplace_back(std::move(op)); } - 
ops_.emplace_back(std::move(op)); - ops_.back()->Attach(op_desc, exec_scope_); } } -void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, - const std::vector& var_names) { +void Program::PrepareWorkspace( + const std::shared_ptr& program_desc, + const std::vector& vars_to_clone) { CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; exec_scope_ = &scope_->NewScope(); // Create Feed and Fetch var. scope_->Var("feed")->GetMutable>(); scope_->Var("fetch")->GetMutable>(); - tmp_vars_.push_back("feed"); - tmp_vars_.push_back("fetch"); + vars_.push_back("feed"); + vars_.push_back("fetch"); - auto VarPrecision2KernlPrecision = + auto VarDescType2PrecisionType = [](const lite::VarDescAPI::Type& type) -> PrecisionType { switch (type) { case lite::VarDescAPI::Type::FP32: @@ -257,44 +351,60 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog, case lite::VarDescAPI::Type::INT64: return PRECISION(kInt64); default: - // LOG(FATAL) << "not supported type: " << static_cast(type); + LOG(WARNING) << "Unable to convert var desc type(" + << static_cast(type) << ") to precision type!"; return PRECISION(kUnk); } }; - auto& program = prog; - CHECK(program.BlocksSize()); - for (size_t b = 0; b < program.BlocksSize(); ++b) { - auto& main_block = *program.GetBlock(b); - for (size_t i = 0; i < main_block.VarsSize(); ++i) { - auto& var_desc = *main_block.GetVar(i); - if (!var_desc.Persistable()) { - if (var_desc.GetType() == lite::VarDescAPI::Type::LOD_TENSOR && - VarPrecision2KernlPrecision(var_desc.GetDataType()) != - PRECISION(kUnk)) { - var_data_type_[var_desc.Name()] = - VarPrecision2KernlPrecision(var_desc.GetDataType()); - } - tmp_vars_.push_back(var_desc.Name()); - VLOG(4) << "var name: " << var_desc.Name() << " type is " - << static_cast(var_desc.GetType()) << " data type is " - << static_cast(var_desc.GetDataType()); - exec_scope_->Var(var_desc.Name()); - if (b > 0) { - VLOG(4) << "var: " << var_desc.Name(); + auto block_size = program_desc->BlocksSize(); + CHECK(block_size); + for (size_t block_idx = 0; block_idx < block_size; ++block_idx) { + auto* block_desc = program_desc->GetBlock(block_idx); + auto var_size = block_desc->VarsSize(); + for (size_t var_idx = 0; var_idx < var_size; ++var_idx) { + auto* var_desc = block_desc->GetVar(var_idx); + const auto& var_name = var_desc->Name(); + const auto& var_type = var_desc->GetType(); + if (!var_desc->Persistable()) { + vars_.push_back(var_name); + auto* var = exec_scope_->Var(var_name); + VLOG(4) << "Var " << var_name << " in block " << block_idx; + VLOG(4) << " - type " << static_cast(var_type); + if (var_type == lite::VarDescAPI::Type::LOD_TENSOR) { + const auto& var_data_type = + VarDescType2PrecisionType(var_desc->GetDataType()); + if (var_data_type != PRECISION(kUnk)) { + var_type_map_[var_name] = LiteType::GetTensorTy( + TARGET(kUnk), var_data_type, DATALAYOUT(kUnk)); + } + VLOG(4) << " - data type " << static_cast(var_data_type); + // Create the tensor with the shape from var desc, it's convenient to + // the graph analysis in the passes, but you should resize the tensor + // with the real shape before accessing its data, because the + // var_shape may be [-1,3,224,224] + const auto& var_shape = var_desc->GetShape(); + auto* tensor = var->GetMutable(); + if (tensor->dims().empty() && !var_shape.empty()) { + tensor->Resize(var_shape); + VLOG(4) << " - dims " << tensor->dims().repr(); + } + } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) { + var_type_map_[var_name] = LiteType::GetTensorListTy( + TARGET(kUnk), 
PRECISION(kUnk), DATALAYOUT(kUnk)); } } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights_.push_back(var_desc.Name()); - if (var_desc.Persistable()) scope_->Var(var_desc.Name()); + if (var_name == "feed" || var_name == "fetch") continue; + weights_.push_back(var_name); + scope_->Var(var_name); } } } - for (auto i : var_names) { - exec_scope_->LocalVar(i); - auto* tensor = scope_->Var(i)->GetMutable(); - auto* sub_tensor = exec_scope_->Var(i)->GetMutable(); + for (auto var_name : vars_to_clone) { + exec_scope_->LocalVar(var_name); + auto* tensor = scope_->Var(var_name)->GetMutable(); + auto* sub_tensor = exec_scope_->Var(var_name)->GetMutable(); sub_tensor->CopyDataFrom(*tensor); } } diff --git a/lite/core/program.h b/lite/core/program.h index 544795af2e6642baedcb6b3d1333f43b428f819d..f0715b9760b81f8de42e0acee5f5839fc42dd65a 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -41,61 +41,66 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; // - scope: which contains all the weights struct Program { public: - explicit Program(const std::shared_ptr& root) { scope_ = root; } - Program(const cpp::ProgramDesc& desc, - const std::shared_ptr& root, + explicit Program(const std::shared_ptr& root_scope) { + scope_ = root_scope; + } + Program(const std::shared_ptr& program_desc, + const std::shared_ptr& root_scope, const std::vector& valid_places, const std::vector& var_names = {}) - : scope_(root), valid_places_(valid_places) { - desc_.CopyFrom(desc); + : scope_(root_scope), valid_places_(valid_places) { CHECK(scope_) << "scope should be init first"; VLOG(4) << "prepare work"; - PrepareWorkspace(desc, var_names); + PrepareWorkspace(program_desc, var_names); VLOG(4) << "build desc"; - Build(desc); + Build(program_desc); VLOG(4) << "build desc finished"; } std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc_, scope_, valid_places_)); - return res; + return std::unique_ptr(new Program(scope_)); } const std::list& weights() const { return weights_; } - const std::list& tmp_vars() const { return tmp_vars_; } + const std::list& vars() const { return vars_; } std::list* mutable_weights() { return &weights_; } - std::list* mutable_tmp_vars() { return &tmp_vars_; } + std::list* mutable_vars() { return &vars_; } - const std::list>& ops() const { return ops_; } - std::list>* mutable_ops() { return &ops_; } + const std::list>& ops( + int block_idx = kRootBlockIdx) const { + return ops_[block_idx]; + } + std::list>* mutable_ops( + int block_idx = kRootBlockIdx) { + return &ops_[block_idx]; + } - lite::Scope* exec_scope() { return exec_scope_; } - lite::Scope* scope() { return scope_.get(); } + size_t block_size() { return ops_.size(); } - cpp::ProgramDesc* program_desc() { return &desc_; } + Scope* exec_scope() { return exec_scope_; } + Scope* scope() { return scope_.get(); } - const std::map& var_data_type() const { - return var_data_type_; + const std::map& var_type_map() const { + return var_type_map_; } private: // Build from a program and scope. - void Build(const cpp::ProgramDesc& program); + void Build(const std::shared_ptr& program_desc); // Create temporary variables. 
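A minimal construction sketch for the reworked Program interface (illustrative; "model_dir" is a placeholder path to an uncombined Paddle model, and the call pattern assumes the LoadModelPb helper used by the subgraph tests):

auto program_desc = std::make_shared<cpp::ProgramDesc>();
auto root_scope = std::make_shared<Scope>();
LoadModelPb("model_dir", "", "", root_scope.get(), program_desc.get(),
            false /* combined_params */, false /* model_from_memory */);
std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
Program program(program_desc, root_scope, valid_places);
// Ops are now grouped per block instead of being flattened into block 0.
for (size_t i = 0; i < program.block_size(); ++i) {
  VLOG(3) << "block " << i << " holds " << program.ops(i).size() << " ops";
}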
- void PrepareWorkspace(const cpp::ProgramDesc& program, - const std::vector& var_names = {}); + void PrepareWorkspace(const std::shared_ptr& program_desc, + const std::vector& vars_to_clone = {}); private: - std::map var_data_type_; - std::list tmp_vars_; + std::map var_type_map_; + std::list vars_; std::list weights_; - std::list> ops_; + std::vector>> ops_; // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope_; + std::shared_ptr scope_; std::vector valid_places_; // Runtime scope. - lite::Scope* exec_scope_{}; - cpp::ProgramDesc desc_; + Scope* exec_scope_{}; }; struct Instruction { @@ -173,8 +178,22 @@ struct Instruction { */ class LITE_API RuntimeProgram { public: - explicit RuntimeProgram(std::vector&& insts) + explicit RuntimeProgram(std::vector>&& insts) : instructions_(std::move(insts)) { + Init(); + } + explicit RuntimeProgram( + const std::shared_ptr& program_desc, + Scope* exec_scope, + int block_idx = kRootBlockIdx); + ~RuntimeProgram() { +#ifdef LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); +#endif // LITE_WITH_PROFILE + } + + void Init() { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } @@ -183,7 +202,7 @@ class LITE_API RuntimeProgram { #endif #ifdef LITE_WITH_NVTX const NVTXAnnotator& annotator = NVTXAnnotator::Global(); - for (auto& inst : instructions_) { + for (auto& inst : instructions_[kRootBlockIdx]) { NVTXRangeAnnotation annotation = annotator.AnnotateBlock(); register_layer_names_.push_back(annotator.RegisterString( const_cast(inst.op())->Type().c_str())); @@ -191,41 +210,38 @@ class LITE_API RuntimeProgram { register_layer_names_.push_back(annotator.RegisterString("one_loop")); #endif } - ~RuntimeProgram() { -#ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); -#endif // LITE_WITH_PROFILE - } void Run(); - void set_exec_scope(lite::Scope* x) { exec_scope_ = x; } - lite::Scope* exec_scope() { return exec_scope_; } + void set_exec_scope(Scope* x) { exec_scope_ = x; } + Scope* exec_scope() { return exec_scope_; } - size_t num_instructions() const { return instructions_.size(); } + const std::vector& instructions( + int block_idx = kRootBlockIdx) const { + return instructions_[block_idx]; + } - const std::vector& instructions() const { return instructions_; } + std::vector* mutable_instructions( + int block_idx = kRootBlockIdx) { + return &instructions_[block_idx]; + } - // `SaveOpInfosToProgram` will update the op list(ops_) of the block 0 - // in ProgramDesc. - void SaveOpInfosToProgram(cpp::ProgramDesc* desc); + size_t block_size() { return instructions_.size(); } - // `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in - // ProgramDesc. Namely, if a new var created in some passes, its var_desc will - // be added in vars_. 
- void UpdateVarsOfProgram(cpp::ProgramDesc* desc); + // Update the ops and vars of all of blocks to the given program_desc + // according to the instructions + void SaveToProgram(std::shared_ptr program_desc); private: RuntimeProgram(const RuntimeProgram&) = delete; - std::vector instructions_; - lite::Scope* exec_scope_{}; + std::vector> instructions_; + Scope* exec_scope_{}; #ifdef LITE_WITH_PROFILE profile::Profiler profiler_; void set_profiler() { - for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { - i->set_profiler(&profiler_); + for (auto& inst : instructions_[kRootBlockIdx]) { + inst.set_profiler(&profiler_); } } #endif diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 197ee4ddbcd5df62dd0f8a15eba39e2a880f7125..3b21cf9147ded7b05938edc6c2985c8fce23842f 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { lod_ = other.lod_; memory_size_ = other.memory_size_; precision_ = other.precision_; + persistable_ = other.persistable_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 3d09c071aa7ecbe51f1723cad314f2aedcdb2bd7..2604f104e72081025d9bd59bb60843cc627ad54f 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -78,6 +78,28 @@ void RunModel(std::string model_dir, // 1. Set MobileConfig MobileConfig config; config.set_model_from_file(model_dir); + + // NOTE: Use android gpu with opencl, you should ensure: + // first, [compile **cpu+opencl** paddlelite + // lib](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/demo_guides/opencl.md); + // second, [convert and use opencl nb + // model](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/docs/user_guides/opt/opt_bin.md). + // + /* Uncomment code below to enable OpenCL + bool is_opencl_backend_valid = ::IsOpenCLBackendValid(); + std::cout << "is_opencl_backend_valid:" << is_opencl_backend_valid << + std::endl; + if (is_opencl_backend_valid) { + // give opencl nb model dir + config.set_model_from_file(model_dir); + } else { + std::cout << "Unsupport opencl nb model." << std::endl; + exit(1); + // you can give backup cpu nb model instead + // config.set_model_from_file(cpu_nb_model_dir); + } + */ + // NOTE: To load model transformed by model_optimize_tool before // release/v2.3.0, plese use `set_model_dir` API as listed below. 
// config.set_model_dir(model_dir); diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 2416278ad74068d28f6de523c55513891b08cc72..5dffd7c1a93225a38e433a4ff447b9b0fc863216 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels} RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 17a836b17183d69b0e2a15b46b7a2097c323312f..91268bc28dbdf38137904f986b254a76cbd5e538 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(mlu) add_subdirectory(apu) add_subdirectory(bm) add_subdirectory(rknpu) +add_subdirectory(huawei_ascend_npu) diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc index 21373811dd91d009d834a16d2c437bc722cd676a..579ed97b161dade9822250dab411cefd214b50f8 100644 --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -37,7 +37,7 @@ bool SubgraphEngine::BuildDeviceProgram() { subgraph::apu::Graph graph; int neuron_errCode = NeuronModel_create(&model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create model"; + LOG(WARNING) << "[APU] Failed to create the neuron model!"; return false; } graph.set_model(model_); @@ -46,11 +46,12 @@ bool SubgraphEngine::BuildDeviceProgram() { // Convert all of ops and their input vars and weights and added into the APU // NIR graph - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -70,55 +71,38 @@ bool SubgraphEngine::BuildDeviceProgram() { } } - // Get input tensor - std::vector ins; - origin_itensors_.resize(input_names_.size()); - origin_idims_.resize(input_names_.size()); + // Get the index of input tensors + std::vector input_indices; for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" - << origin_idims_[i].production(); - // Get input index - int idx; - if (graph.Has(input_names_[i])) { - ins.push_back(graph.Get(input_names_[i])->index()); - VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find input: " << input_names_[i]; - return false; - } + CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node " + << input_names_[i]; + auto index = graph.Get(input_names_[i])->index(); + input_indices.push_back(index); + VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims " + << origin_itensors_[i]->dims() << " index " << index; } - // Get output tensor - std::vector outs; - 
origin_otensors_.resize(output_names_.size()); - origin_odims_.resize(output_names_.size()); + // Get the index of output tensors + std::vector output_indices; for (int i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" - << origin_odims_[i].production(); + CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node " + << output_names_[i]; origin_otensors_[i]->mutable_data(); - // Get input index - if (graph.Has(output_names_[i])) { - outs.push_back(graph.Get(output_names_[i])->index()); - VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); - } else { - LOG(WARNING) << "Fail to find output: " << output_names_[i]; - return false; - } + auto index = graph.Get(output_names_[i])->index(); + output_indices.push_back(index); + VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims " + << origin_otensors_[i]->dims() << " index " << index; } - VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); - // Set subgraph input/output - NeuronModel_identifyInputsAndOutputs( - model_, ins.size(), &ins[0], outs.size(), &outs[0]); + // Indentify the input and output tensors of the neuron model + NeuronModel_identifyInputsAndOutputs(model_, + input_indices.size(), + &input_indices[0], + output_indices.size(), + &output_indices[0]); neuron_errCode = NeuronModel_finish(model_); if (NEURON_NO_ERROR != neuron_errCode) { - LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; + LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode; return false; } VLOG(3) << "[APU] APU NIR model created!"; @@ -207,11 +191,11 @@ SubgraphEngine::~SubgraphEngine() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); } diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h index beb582b8cc16e456491c28ace5e2d1695143216a..de15abdf7fdbce8001676a2bf7f651ad1e435c74 100644 --- a/lite/kernels/apu/subgraph_compute.h +++ b/lite/kernels/apu/subgraph_compute.h @@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} ~SubgraphEngine(); diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 6d1d24adcb4cf74b3c6bb991a33316e974dc0110..f4fe6ba1ebb9a7e775f0d5db1031f9fd40508c20 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -75,7 +75,6 @@ add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_comp add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} 
math_arm) add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -87,7 +86,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h deleted file mode 100644 index 91eadff931ec8aa54092347bcf18f8428130ef75..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/conditional_block_compute.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
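The APU BuildDeviceProgram rewrite above now collects the device operand index for every graph input and output by name and fails fast on a missing node. A minimal stand-alone sketch of that lookup, where a std::unordered_map is a hypothetical stand-in for subgraph::apu::Graph (the real code calls graph.Get(name)->index()):

```cpp
// Sketch of the name -> operand-index collection done before
// NeuronModel_identifyInputsAndOutputs. The map is a stand-in for the bridge
// graph; a missing node is treated as fatal, mirroring the CHECK above.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

std::vector<uint32_t> CollectIndices(
    const std::unordered_map<std::string, uint32_t>& graph,
    const std::vector<std::string>& names) {
  std::vector<uint32_t> indices;
  indices.reserve(names.size());
  for (const auto& name : names) {
    auto it = graph.find(name);
    if (it == graph.end()) {
      throw std::runtime_error("Failed to find node " + name);
    }
    indices.push_back(it->second);
  }
  return indices;
}

int main() {
  std::unordered_map<std::string, uint32_t> graph{{"x", 0}, {"y", 1}, {"out", 2}};
  for (auto idx : CollectIndices(graph, {"x", "y"})) std::cout << idx << "\n";
  return 0;
}
```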
- -#pragma once -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/operators/conditional_block_op.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#include "lite/core/profile/precision_profiler.h" -#include "lite/core/profile/profiler.h" -#endif - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class CondExecutor { - typedef std::shared_ptr OpPtr; - - public: - CondExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - lite::profile::Profiler profiler; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - for (auto &op_handler : ops_of_block_) { - op_handler->CheckShape(); - op_handler->InferShape(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - std::unique_ptr kernel(op_handler->GetKernel()); - Instruction inst(op_handler, std::move(kernel)); - inst.set_profiler(&profiler); -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - op_handler->Run(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class ConditionalBlockCompute - : public KernelLite { - public: - using param_t = operators::ConditionalBlockParam; - - void PrepareForRun() override; - void Run() override; - - virtual ~ConditionalBlockCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/elementwise_compute.cc b/lite/kernels/arm/elementwise_compute.cc index 28082785e1c726097a8bfd2165f0d09b9962a5e7..3e898d9ded2153588c164d2ccd618fc77f7c3854 100644 --- a/lite/kernels/arm/elementwise_compute.cc +++ b/lite/kernels/arm/elementwise_compute.cc @@ -202,17 +202,13 @@ void ElementwiseMulCompute::Run() { } } -template <> -void ElementwiseMulCompute::Run() { - auto& param = this->template Param(); - lite::arm::math::elementwise_compute_basic(param, "mul", ""); -} - -void ElementwiseMulActivationCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseMulActivationCompute::Run() { + auto& param = + this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; std::string act_type = param.act_type; auto x_dims = param.X->dims(); @@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() { if (x_dims.size() < y_dims.size() && is_broadcast(y_dims, 
x_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( y_data, x_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu_broadcast( + lite::arm::math::elementwise_mul_relu_broadcast( x_data, y_data, out_data, pre, n, post); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; } } else { if (act_type == "relu") { - lite::arm::math::elementwise_mul_relu( + lite::arm::math::elementwise_mul_relu( x_data, y_data, out_data, x_dims.production()); } else { LOG(FATAL) << "unsupported Activation type: " << act_type; @@ -426,46 +422,60 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_float = +using elementwise_mul_float_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def) + elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mul_int32 = +using elementwise_mul_int32_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def) + elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); -using elementwise_mul_int64 = +using elementwise_mul_int64_t = paddle::lite::kernels::arm::ElementwiseMulCompute; REGISTER_LITE_KERNEL( - elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def) + elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - fusion_elementwise_mul_activation, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseMulActivationCompute, - def) +using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kFloat, + kNCHW, + fusion_elementwise_mul_activation_float_t, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm:: + ElementwiseMulActivationCompute; +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kARM, + kInt64, + kNCHW, + fusion_elementwise_mul_activation_int64_t, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_max, kARM, kFloat, @@ 
-489,22 +499,22 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_div_fp32 = +using elementwise_div_fp32_t = paddle::lite::kernels::arm::ElementwiseDivCompute; REGISTER_LITE_KERNEL( - elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def) + elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_div_int64 = +using elementwise_div_int64_t = paddle::lite::kernels::arm::ElementwiseDivCompute; REGISTER_LITE_KERNEL( - elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def) + elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) @@ -522,11 +532,11 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -using elementwise_mod_int64 = +using elementwise_mod_int64_t = paddle::lite::kernels::arm::ElementwiseModCompute; REGISTER_LITE_KERNEL( - elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def) + elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64_t, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) diff --git a/lite/kernels/arm/elementwise_compute.h b/lite/kernels/arm/elementwise_compute.h index 7d7a93bf6954de9bbcd1b44061e614cd041fafe8..89d9898648d25fec98568f2456fe96903da0a69d 100644 --- a/lite/kernels/arm/elementwise_compute.h +++ b/lite/kernels/arm/elementwise_compute.h @@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite { virtual ~ElementwiseMulCompute() = default; }; -class ElementwiseMulActivationCompute - : public KernelLite { +template +class ElementwiseMulActivationCompute : public KernelLite { public: void Run() override; diff --git a/lite/kernels/arm/elementwise_compute_test.cc b/lite/kernels/arm/elementwise_compute_test.cc index 62a5bc423ca6e72098332963713e8baffb366325..79262fb4ef75283eba12efa0a4ad8dc048681338 100644 --- a/lite/kernels/arm/elementwise_compute_test.cc +++ b/lite/kernels/arm/elementwise_compute_test.cc @@ -533,13 +533,15 @@ TEST(fusion_elementwise_mul_activation_arm, retrive_op) { } TEST(fusion_elementwise_mul_activation_arm, init) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat)); ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM)); } TEST(fusion_elementwise_mul_activation_arm, compute) { - ElementwiseMulActivationCompute fusion_elementwise_mul_activation; + ElementwiseMulActivationCompute + fusion_elementwise_mul_activation; operators::FusionElementwiseActivationParam param; lite::Tensor x, y, output, output_ref; diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 0ff1cd6b0dc26cdb2b45b00e34baced1bc5fa131..6e3a620a4a8989807481cb0f56ac91643eda4ce7 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -88,7 +88,7 @@ void FcCompute::Run() { auto i_data = 
param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = flag_gemm_ ? param.w->data() : weights_.data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -149,8 +149,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); @@ -208,8 +207,7 @@ void FcCompute::Run() { auto i_data = param.input->data(); auto o_data = param.output->mutable_data(); - auto w_data = - flag_trans_weights_ ? weights_.data() : param.w->data(); + auto w_data = param.w->data(); const float* b_data = param.bias ? param.bias->data() : nullptr; if (flag_trans_bias_) { b_data = bias_.data(); diff --git a/lite/kernels/arm/fc_compute.h b/lite/kernels/arm/fc_compute.h index 4f8a82a8689c1f221ee146176ff7074602cad1c9..e45758775d99112afa0a7e3a45e1c15a9ea371aa 100644 --- a/lite/kernels/arm/fc_compute.h +++ b/lite/kernels/arm/fc_compute.h @@ -104,9 +104,11 @@ class FcCompute : public KernelLite { CHECK_EQ(k_, static_cast(w_dims[0])); flag_gemm_ = check_fc_use_gemm( m_, param.weight_scale, param.bias != nullptr); - if (!flag_trans_weights_ && !flag_gemm_) { - flag_trans_weights_ = true; - fc_trans_weights(*param.w, &weights_); + if (flag_trans_weights_ == flag_gemm_) { + flag_trans_weights_ = !flag_trans_weights_; + Tensor tmp_tensor; + fc_trans_weights(*param.w, &tmp_tensor); + param.w->CopyDataFrom(tmp_tensor); } } @@ -117,7 +119,6 @@ class FcCompute : public KernelLite { private: DDim last_shape_; - Tensor weights_; Tensor bias_; bool flag_trans_weights_{false}; bool flag_trans_bias_{false}; diff --git a/lite/kernels/arm/gather_compute.cc b/lite/kernels/arm/gather_compute.cc index 2a9c70aede7475b36f70c628ff6ccaa823f030b2..f5a87e5431955252e47143252ce13ba4056c4a7f 100644 --- a/lite/kernels/arm/gather_compute.cc +++ b/lite/kernels/arm/gather_compute.cc @@ -20,44 +20,45 @@ namespace lite { namespace kernels { namespace arm { -template +template void GatherFunc(const operators::GatherParam& param) { auto src_dims = param.X->dims(); auto index_size = param.Index->dims()[0]; - auto* p_src = param.X->data(); - const int* p_index = param.Index->data(); - auto* p_output = param.Out->mutable_data(); + auto* p_src = param.X->data(); + const IndexType* p_index = param.Index->data(); + auto* p_output = param.Out->mutable_data(); int slice_size = 1; for (size_t i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; } for (int i = 0; i < index_size; ++i) { - int index_ = p_index[i]; + IndexType index_ = p_index[i]; memcpy(p_output + i * slice_size, p_src + index_ * slice_size, - slice_size * sizeof(T)); + slice_size * sizeof(DataType)); } } -void GatherCompute::Run() { - auto& param = this->Param(); +template +void GatherCompute::Run() { + auto& param = this->template Param(); switch (param.X->precision()) { case PRECISION(kFloat): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt8): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt16): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt32): - GatherFunc(param); + GatherFunc(param); break; case PRECISION(kInt64): - GatherFunc(param); + GatherFunc(param); break; default: LOG(FATAL) << "Gather does not implement for the " @@ -70,9 +71,26 @@ void 
GatherCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) - .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, + kARM, + kAny, + kNCHW, + paddle::lite::kernels::arm::GatherCompute, + def_int64_idx) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/gather_compute.h b/lite/kernels/arm/gather_compute.h index 9753f42972407b250886afa6bada8861a642e189..0226e5f68eee3f23dbd945af6f4f455ab79190c5 100644 --- a/lite/kernels/arm/gather_compute.h +++ b/lite/kernels/arm/gather_compute.h @@ -23,6 +23,7 @@ namespace lite { namespace kernels { namespace arm { +template class GatherCompute : public KernelLite { public: void Run() override; diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index 69740a258be165f9ceec6829a81497e842b5a697..455615e66de53a4a6f235f8ab803394962292936 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "lite/backends/arm/math/conv_block_utils.h" #include "lite/backends/arm/math/conv_impl.h" #include "lite/backends/arm/math/sgemm.h" #include "lite/core/op_registry.h" @@ -101,10 +102,14 @@ void SequenceConvCompute::Run() { 1, 1, // stride_h, stride_w, dilation_h, dilation_w tmp_data); - local_naive_transpose(tmp_data, - sub_col_data, - kernel_size * hidden_dim, - input_row_end - input_row_begin); + int cols = kernel_size * hidden_dim; + int rows = input_row_end - input_row_begin; + if (cols % 4 == 0 && rows % 4 == 0) { + paddle::lite::arm::math::local_transpose( + tmp_data, sub_col_data, cols, rows); + } else { + local_naive_transpose(tmp_data, sub_col_data, cols, rows); + } } } diff --git a/lite/kernels/arm/while_compute.h b/lite/kernels/arm/while_compute.h deleted file mode 100644 index f735d96f9190755daacdf846a2d99901c1a14493..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/while_compute.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
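GatherCompute above is now templated on the index type and registered twice, once for kInt32 and once for kInt64 Index tensors. A host-side sketch of the same slice-copy gather over plain vectors, assuming a flat row-major layout:

```cpp
// Gather whole slices of the first dimension selected by an index tensor.
// DataType/IndexType mirror the two template parameters of the ARM kernel,
// which additionally dispatches on the runtime precision of X.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

template <typename DataType, typename IndexType>
std::vector<DataType> Gather(const std::vector<DataType>& src,
                             size_t slice_size,
                             const std::vector<IndexType>& index) {
  std::vector<DataType> out(index.size() * slice_size);
  for (size_t i = 0; i < index.size(); ++i) {
    std::memcpy(out.data() + i * slice_size,
                src.data() + static_cast<size_t>(index[i]) * slice_size,
                slice_size * sizeof(DataType));
  }
  return out;
}

int main() {
  std::vector<float> x = {0, 0, 1, 1, 2, 2};  // 3 rows, slice_size = 2
  std::vector<int64_t> idx = {2, 0};          // gather rows 2 and 0
  for (float v : Gather(x, 2, idx)) std::cout << v << " ";  // 2 2 0 0
  return 0;
}
```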
- -#pragma once -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/operators/while_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class StepExecutor { - typedef std::shared_ptr OpPtr; - - public: - StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place) - : scope_(scope), place_(place) { - int32_t op_size = block->OpsSize(); - for (int32_t i = 0; i < op_size; ++i) { - auto &op_desc = *block->template GetOp(i); - auto op_type = op_desc.Type(); - auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); - // VLOG(4) << "while: creating Op [" << op_type << "]"; - op_handler->Attach(op_desc, scope); - - auto hostplace = place_; - hostplace.target = TARGET(kHost); - auto kernels = op_handler->CreateKernels({place_, hostplace}); - CHECK_GT(kernels.size(), 0) << "cannot create kernel"; - op_handler->AttachKernel(kernels[0].get()); - op_handler->SetKernel(kernels); - ops_of_block_.push_back(op_handler); - } - } - - void Run() { - for (auto &op_handler : ops_of_block_) { - // VLOG(4) << op_handler->op_info()->Repr(); - op_handler->InferShape(); - // VLOG(4) << "while: infered shape"; - op_handler->Run(); - } - } - - private: - Scope *scope_; - Place place_; - std::vector ops_of_block_; -}; - -class WhileCompute : public KernelLite { - public: - using param_t = operators::WhileParam; - - void Run() override; - void PrepareForRun() override; - - virtual ~WhileCompute() = default; - - private: - std::shared_ptr executor_; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index 664198cf9fb45664fdc088df382b9b94a1924e9b..0bbf3db2e9736c69bc4498993ac6623adafa4a1b 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -28,36 +28,20 @@ namespace lite { namespace kernels { namespace bm { -bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { - // Obtain the origin input tensors, and create the origin output - // tensors(Don't try to access them before launch the device program or the - // origin program) - PrepareWorkspaceForOriginProgram(); - // Create the device input and output tensors, but don't initialize them - // with the dimensions - device_inputs_.resize(input_names_.size()); - for (int i = 0; i < input_names_.size(); i++) { - device_inputs_[i].reset(new hiai::AiTensor); - CHECK(device_inputs_[i]); - } - device_outputs_.resize(output_names_.size()); - for (int i = 0; i < output_names_.size(); i++) { - device_outputs_[i].reset(new hiai::AiTensor); - CHECK(device_outputs_[i]); - } - return true; -} - bool SubgraphEngine::BuildDeviceProgram() { int status = 0; subgraph::bm::Graph graph; const auto& bridges = subgraph::Registry::Instance(); graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); - if (origin_program_.empty()) { + for (size_t i = 0; i < input_names_.size(); i++) { + graph.AddNode(input_names_[i]); + } + if (!origin_program_) { BuildOriginProgram(); } - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -79,7 +63,7 @@ bool SubgraphEngine::BuildDeviceProgram() { std::string net_name = "bmnet_f32bmodel"; auto unique_net_name = lite::subgraph::bm::UniqueName(net_name); __bmcompile_opt( - graph.GetCompilerHandle(), 
const_cast(unique_net_name.c_str()), 2); + graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 1); void* bmodel_data = nullptr; unsigned int data_size = 0; bm_hd_ = static_cast(ctx.GetHandle()); @@ -93,13 +77,11 @@ bool SubgraphEngine::BuildDeviceProgram() { net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); auto& stage = net_info_->stages[0]; // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); device_inputs_.resize(input_names_.size()); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(net_info_->input_names[i]); + origin_itensors_[i] = + exec_scope_->FindMutableTensor(net_info_->input_names[i]); CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); @@ -112,8 +94,6 @@ bool SubgraphEngine::BuildDeviceProgram() { stage.input_shapes[i]); } // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); device_outputs_.resize(net_info_->output_num); int out_index = 0; for (int i = 0; i < output_names_.size(); i++) { @@ -121,14 +101,13 @@ bool SubgraphEngine::BuildDeviceProgram() { } for (int i = 0; i < net_info_->output_num; i++) { - Tensor* t_cur = scope_->FindMutableTensor(net_info_->output_names[i]); + Tensor* t_cur = exec_scope_->FindMutableTensor(net_info_->output_names[i]); CHECK(t_cur != nullptr); bm_device_mem_t* p_mem = static_cast(malloc(sizeof(bm_device_mem_t))); CHECK(p_mem != nullptr); if (outname_map_.find(net_info_->output_names[i]) != outname_map_.end()) { origin_otensors_[out_index] = t_cur; - origin_odims_[out_index] = origin_otensors_[out_index]->dims(); origin_otensors_[out_index]->mutable_data(); out_index += 1; } @@ -173,11 +152,11 @@ bool SubgraphEngine::LaunchDeviceProgram() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); } diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h index 7a5b2552ff95681da09346ba11f40f1a6acb7f01..d1dcb3a6d3ef7eb6d9091eb45d1960862cca273a 100644 --- a/lite/kernels/bm/subgraph_compute.h +++ b/lite/kernels/bm/subgraph_compute.h @@ -36,15 +36,18 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - bool PrepareWorkspaceForDeviceProgram() override; bool BuildDeviceProgram() override; bool LaunchDeviceProgram() override; diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 22bb4345fe744df9a06997d366310e2cc24a7a12..3d396cfa12f8d89e4d868f5bce98cf143ab072ec 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -7,6 +7,7 @@ message(STATUS "compile with lite CUDA kernels") # basic kernels add_kernel(mul_compute_cuda CUDA basic SRCS 
mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(gru_compute_cuda CUDA basic SRCS gru_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(matmul_compute_cuda CUDA basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) @@ -14,6 +15,7 @@ add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${ add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps}) add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sigmoid_compute_cuda CUDA basic SRCS sigmoid_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_concat_compute_cuda CUDA extra SRCS sequence_pool_concat_compute.cu DEPS ${lite_kernel_deps}) @@ -60,6 +62,7 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda) nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda) nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) +nv_test(sigmoid_compute_cuda_test SRCS sigmoid_compute_test.cc DEPS sigmoid_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) @@ -69,6 +72,7 @@ nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_comp #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda) +nv_test(gru_compute_cuda_test SRCS gru_compute_test.cc DEPS gru_compute_cuda) nv_test(matmul_compute_cuda_test SRCS matmul_compute_test.cc DEPS matmul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu index 89f2937f10399361951c3c8deb47e3700f93e288..6a2740101c2883b3b2f7c999bd96fd3fbd3ab3ce 100644 --- a/lite/kernels/cuda/assign_value_compute.cu +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -68,7 +68,7 @@ void AssignValueCompute::Run() { REGISTER_LITE_KERNEL(assign_value, kCUDA, - kAny, + kFloat, kNCHW, paddle::lite::kernels::cuda::AssignValueCompute, def) diff --git a/lite/kernels/cuda/concat_compute_test.cc b/lite/kernels/cuda/concat_compute_test.cc index cc12fcd289d36c38f02663c6a7aaa0ec7c70653a..08dd4013a5ce75ea5abc0c9d678f7437276df161 100644 --- a/lite/kernels/cuda/concat_compute_test.cc +++ b/lite/kernels/cuda/concat_compute_test.cc @@ 
-69,7 +69,7 @@ void concat_compute_ref(const operators::ConcatParam& param) { std::vector input_cols(input.size()); for (int i = 0; i < num; ++i) { int input_i_numel = input[i]->dims().size() == 0 ? 0 : 1; - for (int didx = 0; didx < input[i]->dims().size(); ++didx) { + for (size_t didx = 0; didx < input[i]->dims().size(); ++didx) { input_i_numel *= input[i]->dims()[didx]; } int t_cols = input_i_numel / rows; diff --git a/lite/kernels/cuda/dropout_compute.cc b/lite/kernels/cuda/dropout_compute.cc index 7e3a3a62432f3bc5f2e62112b2b220abc17ee2bd..f9303a39cebda322526e6cc25401db35e1f4309b 100644 --- a/lite/kernels/cuda/dropout_compute.cc +++ b/lite/kernels/cuda/dropout_compute.cc @@ -23,6 +23,9 @@ namespace cuda { void DropoutCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); float* out_data = param.output->mutable_data(TARGET(kCUDA)); int num = param.x->dims().production(); @@ -31,7 +34,7 @@ void DropoutCompute::Run() { if (param.dropout_implementation == "downgrade_in_infer") { scale = 1.0f - prob_data; } - lite::cuda::math::scale(num, x_data, out_data, scale, 0); + lite::cuda::math::scale(num, x_data, out_data, scale, 0.f, stream); } } // namespace cuda diff --git a/lite/kernels/cuda/gru_compute.cu b/lite/kernels/cuda/gru_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..ddca95048b303cce55cc3435b15f945a84fc8c0c --- /dev/null +++ b/lite/kernels/cuda/gru_compute.cu @@ -0,0 +1,394 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
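The dropout hunk above only threads the CUDA execution stream through to math::scale; the inference semantics are unchanged: with dropout_implementation == "downgrade_in_infer" the output is x * (1 - dropout_prob), otherwise x passes through unscaled. A CPU sketch of that rule:

```cpp
// Inference-time dropout: "downgrade_in_infer" multiplies activations by
// (1 - dropout_prob); "upscale_in_train" leaves them untouched at inference
// because the scaling already happened during training.
#include <iostream>
#include <string>
#include <vector>

std::vector<float> DropoutInfer(const std::vector<float>& x,
                                float dropout_prob,
                                const std::string& implementation) {
  float scale = 1.0f;
  if (implementation == "downgrade_in_infer") scale = 1.0f - dropout_prob;
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale;
  return out;
}

int main() {
  for (float v : DropoutInfer({2.f, 4.f}, 0.5f, "downgrade_in_infer"))
    std::cout << v << " ";  // 1 2
  return 0;
}
```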
+#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/bias.h" +#include "lite/backends/cuda/math/gru_forward.h" +#include "lite/backends/cuda/math/sequence2batch.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct GRUMetaValue { + T* gate_weight; + T* state_weight; + T* gate_value; + T* reset_output_value; + T* output_value; + T* prev_out_value; +}; + +template +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + if (lite::TargetWrapperCuda::GetComputeCapability() >= 70) { + if (frame_size < 16) { + constexpr int tiled_size = 8; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } else { + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruGate< + T, + tiled_size><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grids = dim3(frame_blocks, 1); + lite::cuda::math::FastCollectiveGruOut< + T, + tiled_size><<exec_stream()>>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } + return; + } else { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + T><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + T><<exec_stream()>>>(value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size != 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template struct GRUUnitFunctor; + +template <> +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, + int frame_size, + int batch_size, + const lite::cuda::math::ActivationType& active_node, + const lite::cuda::math::ActivationType& active_gate, + bool origin_mode, + lite::cuda::math::Gemm* blas, + CUDAContext* context) { + dim3 threads, grids; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grids = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grids = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size * 2, + frame_size, + frame_size, + frame_size * 2, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.prev_out_value, + value.gate_weight, + value.gate_value, + context); + } + + lite::cuda::math::GruForwardResetOutput< + half><<exec_stream()>>>( + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + + if (value.prev_out_value) { + CHECK(blas->init(false, + false, + batch_size, + frame_size, + frame_size, + frame_size, + frame_size, + frame_size * 3, + context)); + blas->run(1.0f, + 1.0f, + value.reset_output_value, + value.state_weight, + value.gate_value + frame_size * 2, + context); + } + + lite::cuda::math::GruForwardFinalOutput< + half><<exec_stream()>>>( + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode, + batch_size == 1); + CUDA_POST_KERNEL_CHECK; + } +}; + +template +void GRUCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void GRUCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + auto* input = param.input; + lite::Tensor* h0{nullptr}; + if (param.h0) { + h0 = const_cast(param.h0); + } + lite::Tensor* bias{nullptr}; + if (param.bias) { + bias = const_cast(param.bias); + } + const lite::Tensor* 
weight = param.weight; + T* weight_data = const_cast(weight->template data()); + lite::Tensor* batch_gate = param.batch_gate; + lite::Tensor* batch_reset_hidden_prev = param.batch_reset_hidden_prev; + lite::Tensor* batch_hidden = param.batch_hidden; + lite::Tensor* hidden = param.hidden; + T* batch_reset_hidden_prev_data = + batch_reset_hidden_prev->template mutable_data(TARGET(kCUDA)); + hidden->template mutable_data(TARGET(kCUDA)); + T* batch_gate_data = batch_gate->template mutable_data(TARGET(kCUDA)); + T* batch_hidden_data = batch_hidden->template mutable_data(TARGET(kCUDA)); + bool is_reverse = param.is_reverse; + auto active_node = lite::cuda::math::GetActiveType(param.activation); + auto active_gate = lite::cuda::math::GetActiveType(param.gate_activation); + bool origin_mode = param.origin_mode; + + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + + lite::cuda::math::LoDTensor2BatchFunctor batch_func; + batch_func(*input, batch_gate, is_reverse, stream); + + if (bias) { + lite::cuda::math::RowwiseAdd add_bias; + add_bias(batch_gate_data, + bias->template data(), + batch_gate_data, + frame_size, + batch_gate->numel(), + stream); + } + GRUMetaValue gru_value; + gru_value.gate_weight = weight_data; + gru_value.state_weight = weight_data + 2 * frame_size * frame_size; + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ordered_h0_.Resize(h0->dims()); + lite::cuda::math::CopyMatrixRowsFunctor row_shuffle; + row_shuffle(*h0, &ordered_h0_, batch_gate->lod()[2], true, stream); + gru_value.prev_out_value = ordered_h0_.mutable_data(TARGET(kCUDA)); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (size_t n = 0; n < num_batch; ++n) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + gru_value.output_value = batch_hidden_data + bstart * frame_size; + gru_value.gate_value = batch_gate_data + bstart * frame_size * 3; + gru_value.reset_output_value = + batch_reset_hidden_prev_data + bstart * frame_size; + + GRUUnitFunctor::compute(gru_value, + frame_size, + cur_batch_size, + active_node, + active_gate, + origin_mode, + gemm_impl_.get(), + &context); + gru_value.prev_out_value = gru_value.output_value; + } + + lite::cuda::math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(*batch_hidden, hidden, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using GRUFp32 = + paddle::lite::kernels::cuda::GRUCompute; + +using GRUFp16 = paddle::lite::kernels::cuda::GRUCompute; + +REGISTER_LITE_KERNEL(gru, kCUDA, kFloat, kNCHW, GRUFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gru, kCUDA, kFP16, kNCHW, GRUFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), 
PRECISION(kFP16))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Weight", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchGate", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchResetHiddenPrev", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("BatchHidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Hidden", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/gru_compute.h b/lite/kernels/cuda/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..070deca2c54b919d1afeb856633d94fe5919eabd --- /dev/null +++ b/lite/kernels/cuda/gru_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class GRUCompute : public KernelLite { + public: + using param_t = operators::GRUParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GRUCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; + lite::Tensor ordered_h0_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/gru_compute_test.cc b/lite/kernels/cuda/gru_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..adff5b6b28d6a2b4b9513148fa1219f78534dfca --- /dev/null +++ b/lite/kernels/cuda/gru_compute_test.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
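The GRUUnitFunctor above runs two GEMMs plus the reset/final-output kernels for each LoD batch step. As rough orientation only, the per-element gate math can be sketched on the CPU as below; this assumes the common GRU formulation (sigmoid update gate, tanh candidate), and it deliberately does not assert which blend corresponds to origin_mode nor the exact gate buffer layout of the CUDA kernels:

```cpp
// Simplified single-element GRU step. u_pre is the update-gate pre-activation,
// c_pre the candidate pre-activation (already including the reset-gated
// recurrent contribution), h_prev the previous hidden state. swap_blend stands
// in for the origin_mode flag, which selects between the two possible blends.
#include <cmath>
#include <iostream>

float Sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

float GruStep(float u_pre, float c_pre, float h_prev, bool swap_blend) {
  float u = Sigmoid(u_pre);    // update gate
  float c = std::tanh(c_pre);  // candidate state
  return swap_blend ? u * h_prev + (1.f - u) * c
                    : (1.f - u) * h_prev + u * c;
}

int main() {
  std::cout << GruStep(0.2f, 0.5f, 0.1f, false) << "\n";
  return 0;
}
```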
+ +#include "lite/kernels/cuda/gru_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class GRUTest : public ::testing::Test { + protected: + GRUTest() + : batch_(12), + frame_size_(128), + activation_("tanh"), + gate_activation_("sigmoid"), + is_reverse_(false), + origin_mode_(false), + x_shape_({batch_, frame_size_ * 3}), + w_shape_({frame_size_, frame_size_ * 3}), + out_shape_({batch_, frame_size_}), + lod_({{0, 4, 9, 12}}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(lod_); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + batch_gate_gpu_.Resize(lite::DDim(x_shape_)); + batch_hidden_gpu_.Resize(lite::DDim(out_shape_)); + batch_reset_hidden_gpu_.Resize(lite::DDim(out_shape_)); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.weight = &w_gpu_; + param_.gate_activation = gate_activation_; + param_.activation = activation_; + param_.is_reverse = is_reverse_; + param_.origin_mode = origin_mode_; + param_.hidden = &out_gpu_; + param_.batch_gate = &batch_gate_gpu_; + param_.batch_reset_hidden_prev = &batch_reset_hidden_gpu_; + param_.batch_hidden = &batch_hidden_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + } + + void RunBaseLine() {} + + int batch_, frame_size_; + std::string activation_, gate_activation_; + bool is_reverse_, origin_mode_; + std::vector x_shape_, w_shape_, out_shape_; + LoD lod_; + lite::Tensor x_ref_, w_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_; + lite::Tensor x_half_, w_half_; + lite::Tensor batch_gate_gpu_; + lite::Tensor batch_hidden_gpu_; + lite::Tensor batch_reset_hidden_gpu_; + lite::Tensor out_cpu_, out_gpu_; + + operators::GRUParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(GRUTest, TestFP32) { + InitFloatInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration 
= (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +TEST_F(GRUTest, TestFP16) { + InitHalfInput(); + GRUCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/scale_compute.cc b/lite/kernels/cuda/scale_compute.cc index 6bf7414d8c85383a834159678cdd5a09e0b434d9..9ce5905a7de750e1eed41e56784419c737e6d2d9 100644 --- a/lite/kernels/cuda/scale_compute.cc +++ b/lite/kernels/cuda/scale_compute.cc @@ -23,8 +23,11 @@ namespace cuda { void ScaleCompute::Run() { auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const float* x_data = param.x->data(); - float* output_data = param.output->mutable_data(); + float* output_data = param.output->mutable_data(TARGET(kCUDA)); DDim x_dims = param.x->dims(); bool bias_after_scale = param.bias_after_scale; float scale = param.scale; @@ -33,7 +36,7 @@ void ScaleCompute::Run() { bias *= scale; } lite::cuda::math::scale( - x_dims.production(), x_data, output_data, scale, bias); + x_dims.production(), x_data, output_data, scale, bias, stream); } } // namespace cuda diff --git a/lite/kernels/cuda/sequence_mask_compute.cu b/lite/kernels/cuda/sequence_mask_compute.cu index 8a8f292c103b8fb7b55940cf075d4b80b3fb328d..8e227a6a272127f500e10775f7ed4db53660e1f8 100644 --- a/lite/kernels/cuda/sequence_mask_compute.cu +++ b/lite/kernels/cuda/sequence_mask_compute.cu @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
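The scale kernel above pre-multiplies the bias when bias_after_scale is false, so the device helper always evaluates scale * x + bias on the given stream. Equivalent CPU semantics, for reference:

```cpp
// Scale op semantics: bias_after_scale == true  -> out = scale * x + bias
//                     bias_after_scale == false -> out = scale * (x + bias)
// The kernel folds the second case into the first by pre-multiplying bias.
#include <iostream>
#include <vector>

std::vector<float> Scale(const std::vector<float>& x,
                         float scale,
                         float bias,
                         bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = scale * x[i] + bias;
  return out;
}

int main() {
  for (float v : Scale({1.f, 2.f}, 2.f, 1.f, false)) std::cout << v << " ";  // 4 6
  return 0;
}
```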
-#include "lite/kernels/cuda/sequence_mask_compute.h" - #include +#include #include #include "lite/backends/cuda/cuda_utils.h" #include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_mask_compute.h" namespace paddle { namespace lite { @@ -44,7 +44,7 @@ void SequenceMaskCompute::Run() { auto stream = ctx.exec_stream(); const auto* x = param.X; - auto* x_data = x->template data(); + const int64_t* x_data = x->template data(); auto* y = param.Y; int maxlen = param.maxlen; @@ -57,8 +57,11 @@ void SequenceMaskCompute::Run() { } if (maxlen < 0) { - maxlen = thrust::reduce( - x_data, x_data + x->numel(), 0, thrust::maximum()); + maxlen = static_cast( + thrust::reduce(thrust::device_pointer_cast(x_data), + thrust::device_pointer_cast(x_data) + x->numel(), + static_cast(0), + thrust::maximum())); } auto y_dim = x->dims().Vectorize(); diff --git a/lite/kernels/cuda/sequence_pad_compute.cu b/lite/kernels/cuda/sequence_pad_compute.cu index 1e304f00633794dcac5d8ebfcd9d79defb4980f7..8368eb3007e3f1d036420a5dc1c86204365e179c 100644 --- a/lite/kernels/cuda/sequence_pad_compute.cu +++ b/lite/kernels/cuda/sequence_pad_compute.cu @@ -32,9 +32,19 @@ void SequencePadCompute::Run() { const auto* pad_value = param.PadValue; auto* out = param.Out; auto* len_t = param.Length; - int padded_length = param.padded_length; - int seq_num = x->lod()[0].size() - 1; + int padded_length; + if (param.padded_length == -1) { + int max_seq_len = 0; + for (int i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + } + padded_length = max_seq_len; + } else { + padded_length = param.padded_length; + } + int max_seq_len = 0; int step_width = x->numel() / x->dims()[0]; diff --git a/lite/kernels/cuda/sequence_unpad_compute.cu b/lite/kernels/cuda/sequence_unpad_compute.cu index bdedd74588884aa1e4b7f7c7ae3f414810b0826a..b4274e19a86d55a4e5e5099e984c537c2929bce7 100644 --- a/lite/kernels/cuda/sequence_unpad_compute.cu +++ b/lite/kernels/cuda/sequence_unpad_compute.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include + #include "lite/backends/cuda/math/sequence_padding.h" #include "lite/core/op_registry.h" #include "lite/core/target_wrapper.h" @@ -29,8 +30,39 @@ void SequenceUnpadCompute::Run() { auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); + auto x_dims = param.X->dims(); + auto len_dims = param.Length->dims(); + + auto* seq_len_ptr = param.Length->template data(); + seq_len_cpu_.Resize(param.Length->dims()); + TargetWrapperCuda::MemcpyAsync(seq_len_cpu_.mutable_data(), + seq_len_ptr, + sizeof(int64_t) * param.Length->numel(), + IoDirection::DtoH, + stream); + TargetWrapperCuda::StreamSync(stream); + + int64_t batch_size = len_dims[0]; + std::vector out_lod0(batch_size + 1, 0); + for (int64_t i = 0; i < batch_size; ++i) { + out_lod0[i + 1] = out_lod0[i] + seq_len_cpu_.data()[i]; + } + paddle::lite::LoD out_lod; + out_lod.push_back(out_lod0); + + int64_t out_dim0 = out_lod0.back(); + std::vector out_dims{out_dim0}; + if (x_dims.size() == 2) { + out_dims.push_back(1); + } else { + for (size_t i = 2; i < x_dims.size(); ++i) { + out_dims.push_back(x_dims[i]); + } + } + param.Out->Resize(out_dims); + param.Out->set_lod(out_lod); + const auto* pad_tensor = param.X; - const auto* len_t = param.Length; auto* seq_tensor = param.Out; int padded_length = pad_tensor->dims()[1]; diff --git a/lite/kernels/cuda/sequence_unpad_compute.h b/lite/kernels/cuda/sequence_unpad_compute.h index f36520ea15c4ad504b2fd357d8729d6d0dbc2615..6b077a4dcbd91eb8f9a9e2cb1340088434f117aa 100644 --- a/lite/kernels/cuda/sequence_unpad_compute.h +++ b/lite/kernels/cuda/sequence_unpad_compute.h @@ -31,6 +31,7 @@ class SequenceUnpadCompute : public KernelLite { private: lite::Tensor seq_offsets_; + lite::Tensor seq_len_cpu_; std::vector seq_offsets_vec_; }; diff --git a/lite/kernels/cuda/sigmoid_compute.cu b/lite/kernels/cuda/sigmoid_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..2879f50b4d8a61c80c8c73bf8b3f43e4c8dbe5b0 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
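The sequence_unpad change above rebuilds the output LoD on the host: the Length tensor is copied device-to-host and prefix-summed into level-0 offsets, whose last entry gives the unpadded row count used to resize Out. The host-side part, sketched with std containers:

```cpp
// Rebuild a level-0 LoD from per-sequence lengths. The result is a cumulative
// offset vector whose last element is the total number of unpadded rows.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint64_t> LodFromLengths(const std::vector<int64_t>& lengths) {
  std::vector<uint64_t> lod0(lengths.size() + 1, 0);
  for (size_t i = 0; i < lengths.size(); ++i) {
    lod0[i + 1] = lod0[i] + static_cast<uint64_t>(lengths[i]);
  }
  return lod0;
}

int main() {
  auto lod0 = LodFromLengths({4, 5, 3});
  for (auto v : lod0) std::cout << v << " ";   // 0 4 9 12
  std::cout << "\nrows: " << lod0.back() << "\n";  // 12
  return 0;
}
```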
+ +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sigmoid_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SigmoidCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + int num = static_cast(param.X->numel()); + auto input = param.X->template data(); + auto output = param.Out->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::sigmoid(num, input, output, stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SigmoidFp32 = + paddle::lite::kernels::cuda::SigmoidCompute; + +using SigmoidFp16 = + paddle::lite::kernels::cuda::SigmoidCompute; + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFloat, kNCHW, SigmoidFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, kCUDA, kFP16, kNCHW, SigmoidFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sigmoid_compute.h b/lite/kernels/cuda/sigmoid_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..455dc38d1f8d04fdaf5f4a70ee704c8a2fe7ddef --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SigmoidCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + virtual ~SigmoidCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sigmoid_compute_test.cc b/lite/kernels/cuda/sigmoid_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27904333b918baf0de7042005955b8fb44d6930 --- /dev/null +++ b/lite/kernels/cuda/sigmoid_compute_test.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sigmoid_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SigmoidTest : public ::testing::Test { + protected: + SigmoidTest() : m_(8), n_(64), shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(shape_)); + x_gpu_.Resize(lite::DDim(shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } + + void RunBaseLine() { + for (int64_t i = 0; i < x_ref_.numel(); ++i) { + out_ref_.mutable_data()[i] = + 1.f / (1.f + expf(-1 * x_ref_.data()[i])); + } + } + + int m_, n_; + std::vector shape_; + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::ActivationParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SigmoidTest, TestFP32) { + InitFloatInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(SigmoidTest, TestFP16) { + InitHalfInput(); + SigmoidCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = 
out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu index b847069879357ea600fd62b8f70d6c50e3c8c35f..b14073e5e1bfe074d355265726562579895dde86 100644 --- a/lite/kernels/cuda/var_conv_2d_compute.cu +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -184,6 +184,8 @@ using VarConvFp16 = REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); @@ -191,6 +193,9 @@ REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFloat, kNCHW, VarConvFp32, def) REGISTER_LITE_KERNEL(var_conv_2d, kCUDA, kFP16, kNCHW, VarConvFp16, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) .Finalize(); diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index d6df1f877bc31d7d1d443aa809997f895b3f7ec6..4b082a92e9119deef74ceea3889730159ffdaf9d 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -19,6 +19,9 @@ add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute. 
add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps}) add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps}) add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(print_compute_host Host extra SRCS print_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(while_compute_host Host extra SRCS while_compute.cc DEPS ${lite_kernel_deps} program) +add_kernel(conditional_block_compute_host Host extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} program) add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps}) if(LITE_BUILD_EXTRA) diff --git a/lite/kernels/host/assign_compute.cc b/lite/kernels/host/assign_compute.cc index e496ffbd1d9a6362d730117be949cbdab83ec62a..bfbbc32e5f3b3b4dd5936e0e296306641312cabf 100644 --- a/lite/kernels/host/assign_compute.cc +++ b/lite/kernels/host/assign_compute.cc @@ -51,3 +51,19 @@ REGISTER_LITE_KERNEL( PRECISION(kAny), DATALAYOUT(kAny))}) .Finalize(); + +REGISTER_LITE_KERNEL(assign, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::AssignCompute, + def_tensor_array) + .BindInput("X", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorListTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/host/conditional_block_compute.cc similarity index 51% rename from lite/kernels/arm/conditional_block_compute.cc rename to lite/kernels/host/conditional_block_compute.cc index f0bd43e1300d4034241c03d3e4ce27dcaa59c1e5..5bdca012dd4e838f3371bae7cf17634513d59db5 100644 --- a/lite/kernels/arm/conditional_block_compute.cc +++ b/lite/kernels/host/conditional_block_compute.cc @@ -12,28 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
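// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): the extra
// "def_tensor_array" registration of the assign kernel above lets it match
// variables holding a tensor array (a std::vector of lite::Tensor) instead of
// a single tensor; semantically the op then copies every element, roughly as
// below. CopyTensorArray and TensorT are hypothetical stand-ins, not the
// actual AssignCompute implementation.
#include <vector>

template <typename TensorT>
void CopyTensorArray(const std::vector<TensorT>& src,
                     std::vector<TensorT>* dst) {
  dst->resize(src.size());
  for (size_t i = 0; i < src.size(); ++i) {
    (*dst)[i].CopyDataFrom(src[i]);  // per-element deep copy
  }
}
// ---------------------------------------------------------------------------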
-#include "lite/kernels/arm/conditional_block_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/conditional_block_compute.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void ConditionalBlockCompute::PrepareForRun() { - auto& param = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto& param = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } + void ConditionalBlockCompute::Run() { - auto& param = Param(); + auto& param = this->Param(); for (auto& out : param.outs) { out->clear(); } @@ -43,32 +36,40 @@ void ConditionalBlockCompute::Run() { auto* cond_data = cond->data(); need_run = cond_data[0]; } else { - auto x = param.x; - for (auto pt : x) { - if (pt == nullptr || !pt->IsInitialized() || pt->dims().empty()) { + for (auto input : param.inputs) { + if (input == nullptr || !input->IsInitialized() || + input->dims().empty()) { need_run = false; break; } } } if (need_run) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL(conditional_block, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ConditionalBlockCompute, + kHost, + kAny, + kAny, + paddle::lite::kernels::host::ConditionalBlockCompute, def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Cond", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Scope", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Input", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindInput("Cond", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) + .BindOutput("Out", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("Scope", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/conditional_block_compute.h b/lite/kernels/host/conditional_block_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8d3381ce3c4d6da076e6bb477df423bc640c56c9 --- /dev/null +++ b/lite/kernels/host/conditional_block_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
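// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): it restates the
// decision made in ConditionalBlockCompute::Run() above. If a boolean "Cond"
// tensor is present, its first element decides whether the sub-block runs;
// otherwise the sub-block runs only when every input is non-null, initialized
// and has non-empty dims. InputView and NeedRunSubBlock are hypothetical
// stand-ins used to keep the example self-contained.
#include <vector>

struct InputView {
  bool valid = false;  // non-null, initialized, non-empty dims
};

static bool NeedRunSubBlock(const bool* cond_data,
                            const std::vector<InputView>& inputs) {
  if (cond_data != nullptr) {
    return cond_data[0];           // driven by Cond[0]
  }
  for (const auto& in : inputs) {  // fall back to input readiness
    if (!in.valid) return false;
  }
  return true;
}
// ---------------------------------------------------------------------------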
+ +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class ConditionalBlockCompute + : public KernelLite { + public: + using param_t = operators::ConditionalBlockParam; + + void PrepareForRun() override; + void Run() override; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/print_compute.cc b/lite/kernels/host/print_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..00c8ab7b13597ad33b9fafc878cd553572462a99 --- /dev/null +++ b/lite/kernels/host/print_compute.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/print_compute.h" + +#include // NOLINT +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +const char kForward[] = "FORWARD"; +const char kBackward[] = "BACKWARD"; +const char kBoth[] = "BOTH"; + +class TensorFormatter { + public: + TensorFormatter() {} + + std::string Format(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + std::stringstream log_stream; + if (!tensor_name.empty()) { + log_stream << "Variable: " << tensor_name << std::endl; + } + + if (!message.empty()) { + log_stream << " - message: " << message << std::endl; + } + + if (print_tensor_lod_) { + log_stream << " - lod: {"; + const LoD& lod = print_tensor.lod(); + for (auto level : lod) { + log_stream << "{"; + bool is_first = true; + for (auto i : level) { + if (is_first) { + log_stream << i; + is_first = false; + } else { + log_stream << ", " << i; + } + } + log_stream << "}"; + } + log_stream << "}" << std::endl; + } + + log_stream << " - place: " << TargetToStr(print_tensor.target()) + << std::endl; // TODO(hong19860320) always kHost + + if (print_tensor_shape_) { + log_stream << " - shape: " << print_tensor.dims().repr() << std::endl; + } + + if (print_tensor_layout_) { + log_stream << " - layout: " + << DataLayoutToStr( + DATALAYOUT(kNCHW)) // TODO(hong19860320) Query the data + // layout from target tensor + << std::endl; + } + + auto dtype = print_tensor.precision(); + if (print_tensor_type_) { + log_stream << " - dtype: " << PrecisionToStr(dtype) << std::endl; + } + + if (dtype == PRECISION(kBool)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt8)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt16)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt32)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kInt64)) { + FormatData(print_tensor, log_stream); + } else if (dtype == PRECISION(kFloat)) { + FormatData(print_tensor, log_stream); + } else { + 
log_stream << "\tdata: unprintable type: " << PrecisionToStr(dtype) + << std::endl; + } + return log_stream.str(); + } + + void Print(const Tensor& print_tensor, + const std::string& tensor_name = "", + const std::string& message = "") { + static std::mutex mutex; + std::lock_guard lock(mutex); + std::cout << Format(print_tensor, tensor_name, message); + } + + void SetPrintTensorType(bool print_tensor_type) { + print_tensor_type_ = print_tensor_type; + } + void SetPrintTensorShape(bool print_tensor_shape) { + print_tensor_shape_ = print_tensor_shape; + } + void SetPrintTensorLod(bool print_tensor_lod) { + print_tensor_lod_ = print_tensor_lod; + } + void SetPrintTensorLayout(bool print_tensor_layout) { + print_tensor_layout_ = print_tensor_layout; + } + void SetSummarize(int64_t summarize) { summarize_ = summarize; } + + private: + template + void FormatData(const Tensor& print_tensor, std::stringstream& log_stream) { + int64_t print_size = summarize_ == -1 + ? print_tensor.numel() + : std::min(summarize_, print_tensor.numel()); + const T* data = print_tensor.data(); // Always kHost, so unnessary to + // copy the data from device + log_stream << " - data: ["; + if (print_size > 0) { + log_stream << data[0]; + for (int64_t i = 1; i < print_size; ++i) { + log_stream << " " << data[i]; + } + } + log_stream << "]" << std::endl; + } + + int64_t summarize_ = -1; + bool print_tensor_type_ = true; + bool print_tensor_shape_ = true; + bool print_tensor_lod_ = true; + bool print_tensor_layout_ = true; +}; + +void PrintCompute::Run() { + auto& param = Param(); + param.out->CopyDataFrom(*param.in); + + if ((param.is_forward && param.print_phase == kBackward) || + (!param.is_forward && param.print_phase == kForward)) { + return; + } + + int first_n = param.first_n; + if (first_n > 0 && ++times_ > first_n) return; + + TensorFormatter formatter; + const std::string& name = param.print_tensor_name ? param.name : ""; + formatter.SetPrintTensorType(param.print_tensor_type); + formatter.SetPrintTensorShape(param.print_tensor_shape); + formatter.SetPrintTensorLod(param.print_tensor_lod); + formatter.SetPrintTensorLayout(param.print_tensor_layout); + formatter.SetSummarize(static_cast(param.summarize)); + formatter.Print(*param.in, name, param.message); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + print, kHost, kAny, kAny, paddle::lite::kernels::host::PrintCompute, def) + .BindInput("In", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); diff --git a/lite/kernels/host/print_compute.h b/lite/kernels/host/print_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91a54182d2d2e00250da01fcd5d62556da930198 --- /dev/null +++ b/lite/kernels/host/print_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class PrintCompute + : public KernelLite { + public: + using param_t = operators::PrintParam; + + void Run() override; + + virtual ~PrintCompute() = default; + + private: + mutable int times_{0}; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/while_compute.cc b/lite/kernels/host/while_compute.cc similarity index 50% rename from lite/kernels/arm/while_compute.cc rename to lite/kernels/host/while_compute.cc index 9241fd410a542cef797b57b9341f59895b0f734d..4886b5ffe0f48b231bcef59b5494fc126b8b69e2 100644 --- a/lite/kernels/arm/while_compute.cc +++ b/lite/kernels/host/while_compute.cc @@ -12,44 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/while_compute.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" +#include "lite/kernels/host/while_compute.h" +#include +#include namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace host { void WhileCompute::PrepareForRun() { - auto ¶m = Param(); - auto cur_scope = param.scope; - - executor_ = - std::make_shared(param.sub_block, cur_scope, place()); + auto ¶m = this->Param(); + program_.reset(new RuntimeProgram( + param.program_desc, param.exec_scope, param.block_idx)); } void WhileCompute::Run() { - auto ¶m = Param(); + auto ¶m = this->Param(); while (param.cond->data()[0]) { - executor_->Run(); + program_->Run(); } } -} // namespace arm +} // namespace host } // namespace kernels } // namespace lite } // namespace paddle REGISTER_LITE_KERNEL( - while, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::WhileCompute, def) - .BindInput("X", {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) + while, kHost, kAny, kAny, paddle::lite::kernels::host::WhileCompute, def) + .BindInput("X", + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .BindInput("Condition", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)}) .BindOutput("Out", - {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kAny))}) - .BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))}) + {LiteType::GetTensorListTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("StepScopes", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) .Finalize(); diff --git a/lite/kernels/host/while_compute.h b/lite/kernels/host/while_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..42065865e45c18376034dea0e105bc6d4f1f053f --- /dev/null +++ b/lite/kernels/host/while_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/program.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhileCompute + : public KernelLite { + public: + using param_t = operators::WhileParam; + + void Run() override; + void PrepareForRun() override; + + virtual ~WhileCompute() = default; + + private: + std::unique_ptr program_; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..be0a8d05081e3dda5f474689dc4eed23bc5f56c4 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_huawei_ascend_npu subgraph_bridge_engine ${huawei_ascend_npu_subgraph_bridges}) diff --git a/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6fac2b0b560dcc467132abe9a21c2c75d266a77 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/CMakeLists.txt @@ -0,0 +1,19 @@ +if(NOT LITE_WITH_HUAWEI_ASCEND_NPU) + return() +endif() + +lite_cc_library(subgraph_bridge_utility_huawei_ascend_npu SRCS utility.cc DEPS) +lite_cc_library(subgraph_bridge_graph_huawei_ascend_npu SRCS graph.cc DEPS subgraph_bridge_utility_huawei_ascend_npu) + +set(huawei_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_huawei_ascend_npu subgraph_bridge_graph_huawei_ascend_npu) + +lite_cc_library(subgraph_bridge_act_op_huawei_ascend_npu SRCS act_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_conv_op_huawei_ascend_npu SRCS conv_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps}) + +set(huawei_ascend_npu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_engine + subgraph_bridge_graph_huawei_ascend_npu + subgraph_bridge_act_op_huawei_ascend_npu + subgraph_bridge_conv_op_huawei_ascend_npu + CACHE INTERNAL "huawei_ascend_npu_subgraph_bridges") diff --git a/lite/kernels/huawei_ascend_npu/bridges/act_op.cc b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0293515356a13035fcdc4725c5de132ea06ceb67 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/act_op.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +template +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + + return SUCCESS; +} + +template <> +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + // only for leaky_relu + auto alpha = op_info->GetAttr("alpha"); + act_op->set_attr_negative_slope(alpha); + + return SUCCESS; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + sigmoid, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + tanh, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu6, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + leaky_relu, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softsign, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softplus, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ActConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc 
b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..075bbca8bd63a3c12d74b3624c6a1d51d7edfb76 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/conv_op.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/conv_op.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " << op_type << "... "; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + ge::DataType ge_data_type = CvtPrecisionType(input->precision()); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + // Conv2D: groups must set to 1; DepthwiseConv2D: groups not supported. + CHECK_LE(groups, 1) + << "[HUAWEI_ASCEND_NPU] groups > 1 NOT supported, groups: " << groups; + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? 
op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + // Input node + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + input_node = graph->Add(input_name, *input); + } + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[HUAWEI_ASCEND_NPU] Paddings size should be " + "the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + // Check depthwise mode, and decide whether use DepthwiseConv2D Op + bool use_depthwise_conv = false; + bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1); + if (is_depthwise_mode && dilations[0] == 1 && dilations[1] == 1) { + use_depthwise_conv = true; + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] DepthwiseConv2D op is used."; + } + + // Filter node + auto filter_node = graph->Add(filter_name, *filter); + + // Add bias node if exists bias + // Supports the bias nodes with the following dimensions + // 0: {oc} => 1D tensor of foramt ND + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + std::vector bias_shape; + std::shared_ptr bias_node = nullptr; + bool is_channel_bias = false; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + // Ascend must update convop desc, or IR model build will fail + ge::TensorDesc conv2d_input_desc_x( + ge::Shape(CvtShape(input_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_filter( + ge::Shape(CvtShape(filter_dims)), ge::FORMAT_NCHW, ge_data_type); + ge::TensorDesc conv2d_input_desc_bias( + ge::Shape(bias_shape), ge::FORMAT_ND, ge_data_type); + ge::TensorDesc conv2d_output_desc_y( + ge::Shape(CvtShape(output_dims)), ge::FORMAT_NCHW, ge_data_type); + // Setting desc name + conv2d_input_desc_x.SetName("conv2d_input_desc_x"); + conv2d_input_desc_filter.SetName("conv2d_input_desc_filter"); + conv2d_input_desc_bias.SetName("conv2d_input_desc_bias"); + conv2d_output_desc_y.SetName("conv2d_output_desc_y"); + // Conv node + std::shared_ptr conv_node = nullptr; + if (use_depthwise_conv && is_depthwise_mode) { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + 
conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({1, 1, strides[0], strides[1]})); + conv_op->set_attr_dilations({1, 1, dilations[0], dilations[1]}); + conv_op->set_attr_pads( + {paddings[0], paddings[1], paddings[2], paddings[3]}); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } else { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_strides( + ge::Operator::OpListInt({bs, ic, strides[0], strides[1]})); + conv_op->set_attr_pads(ge::Operator::OpListInt( + {paddings[0], paddings[1], paddings[2], paddings[3]})); + conv_op->set_attr_dilations( + ge::Operator::OpListInt({bs, ic, dilations[0], dilations[1]})); + conv_op->set_attr_groups(groups); + conv_op->set_attr_data_format("NCHW"); + if (bias_node != nullptr && is_channel_bias) { + conv_op->set_input_bias(*bias_node->data()); + conv_op->update_input_desc_bias(conv2d_input_desc_bias); + } + // update tensor desc to conv2d + conv_op->update_input_desc_x(conv2d_input_desc_x); + conv_op->update_input_desc_filter(conv2d_input_desc_filter); + conv_op->update_output_desc_y(conv2d_output_desc_y); + } + // append Add node to support bias + if (bias_node != nullptr && !is_channel_bias) { + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_node = add_node; + } + CHECK(conv_node); + + // ONLY support relu/leaky_relu now + // to do (@qili93): add more act types + if (!act_type.empty()) { + if (act_type == "relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + } else if (act_type == "leaky_relu") { + auto act_node = graph->Add(output_name); + auto act_op = act_node->data(); + act_op->set_input_x(*conv_node->data()); + act_op->set_attr_negative_slope(leaky_relu_alpha); + } else { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] act type not supported: " + << act_type; + return FAILED; + } + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE( + depthwise_conv2d, + kHuaweiAscendNPU, + paddle::lite::subgraph::huawei_ascend_npu::ConvConverter); diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.cc b/lite/kernels/huawei_ascend_npu/bridges/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..7e1eaf1228fd3df7583ddc194b3d58862ddc0e12 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Const or data node " << name + << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + ge::TensorDesc desc(ge::Shape(shape), + CvtDataLayoutType(layout), + CvtPrecisionType(precision)); + desc.SetName("const_node_desc"); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + node->data()->update_output_desc_y(desc); + } else { + // Data node + node = Add(name, shape, precision, layout); + } + return node; +} + +// Data node +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); + ge::TensorDesc desc( + ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); + desc.SetName("data_node_desc"); + node->data()->update_input_desc_x(desc); + node->data()->update_output_desc_y(desc); + return node; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/graph.h b/lite/kernels/huawei_ascend_npu/bridges/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..bb763004939a4ccfffdd526e92bc029509aab45e --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/graph.h @@ -0,0 +1,196 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
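// ---------------------------------------------------------------------------
// Editor's note (illustrative usage sketch, not part of this patch): a bridge
// typically interacts with the Graph defined in graph.cc above (and graph.h
// below) like this: reuse the node already registered under a variable name,
// otherwise add a const node (persistable tensor) or data node and let
// Graph::Add generate the unique "__<idx>" suffix for the underlying IR op.
// GetOrAddInputNode is a hypothetical helper; Graph, Node and Tensor come from
// this patch.
static std::shared_ptr<Node> GetOrAddInputNode(Graph* graph,
                                               const std::string& name,
                                               const Tensor& tensor) {
  if (graph->Has(name)) {
    return graph->Get(name);  // variable nodes may be shared by name
  }
  return graph->Add(name, tensor);  // const or data node, NCHW layout by default
}
// ---------------------------------------------------------------------------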
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/tensor.h" +#include "op_proto/built-in/inc/all_ops.h" // opp/op_proto/built-in/inc + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { + public: + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } + void set_precision(PrecisionType precision) { precision_ = precision; } + void set_layout(DataLayoutType layout) { layout_ = layout; } + void set_role(Role role) { role_ = role; } + + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } + PrecisionType precision() const { return precision_; } + DataLayoutType layout() const { return layout_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } + + private: + std::shared_ptr data_{nullptr}; + PrecisionType precision_{PRECISION(kFloat)}; + DataLayoutType layout_{DATALAYOUT(kNCHW)}; + Role role_{Role::kVar}; +}; + +class Graph { + public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node + template + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; + } + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data( + std::make_shared(name + "__" + paddle::lite::to_string(idx))); + return node; + } + + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); + } + + // Const node + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + if (shape.empty()) { + shape = {static_cast(data.size())}; + } else { + int size = 1; + for (auto i : shape) { + size *= i; + } + CHECK_EQ(data.size(), size); + } + Tensor tensor; + tensor.Resize(shape); + tensor.set_persistable(true); + std::memcpy(reinterpret_cast(tensor.mutable_data()), + reinterpret_cast(data.data()), + data.size() * sizeof(T)); + return Add(name, tensor, layout); + } + + template + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return 
Add(name, data, dims.Vectorize(), layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + int64_t size = 1; + for (auto i : shape) { + size *= i; + } + std::vector data(size, value); + return Add(name, data, shape, layout); + } + + template + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); + } + + // Data node + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); + } + + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[HUAWEI_ASCEND_NPU] Node " << name << " not found."; + return nodes_.at(name).back(); + } + + bool Has(const std::string& name) { + return nodes_.find(name) != nodes_.end(); + } + + private: + std::map>> nodes_; +}; + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..5d38a4b0e68df0ddd66e0642e34323c40a6f1056 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h @@ -0,0 +1,27 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// activation +USE_SUBGRAPH_BRIDGE(sigmoid, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(tanh, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(relu6, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softsign, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(softplus, kHuaweiAscendNPU); +// conv +USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kHuaweiAscendNPU); diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.cc b/lite/kernels/huawei_ascend_npu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..2fdaa49b94f48ad12b58036cd89d2f545566cad6 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +ge::DataType CvtPrecisionType(PrecisionType itype) { + ge::DataType otype = ge::DT_FLOAT; + switch (itype) { + case PRECISION(kFloat): + otype = ge::DT_FLOAT; + break; + case PRECISION(kFP16): + otype = ge::DT_FLOAT16; + break; + case PRECISION(kInt8): + otype = ge::DT_INT8; + break; + case PRECISION(kInt16): + otype = ge::DT_INT16; + break; + case PRECISION(kInt32): + otype = ge::DT_INT32; + break; + case PRECISION(kInt64): + otype = ge::DT_INT64; + break; + // TODO(liq27) support more precision type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert precision type(" + << PrecisionToStr(itype) << ") from Lite to NPU"; + break; + } + return otype; +} + +ge::Format CvtDataLayoutType(DataLayoutType itype) { + ge::Format otype = ge::FORMAT_NCHW; + switch (itype) { + case DATALAYOUT(kNCHW): + otype = ge::FORMAT_NCHW; + break; + case DATALAYOUT(kNHWC): + otype = ge::FORMAT_NHWC; + break; + // TODO(liq27) support more data layout type + default: + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert data layout type(" + << DataLayoutToStr(itype) + << ") from Lite to HUAWEI_ASCEND_NPU"; + break; + } + return otype; +} + +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (size_t i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (size_t i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape, + DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); + auto in_size = in_tensor.dims().production(); + auto in_shape = in_tensor.dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + ge::TensorDesc out_desc(ge::Shape(out_shape), + CvtDataLayoutType(in_layout), + CvtPrecisionType(in_precision)); + auto out_size = out_desc.GetShape().GetShapeSize(); + CHECK_EQ(out_size, in_size); + ge::Tensor out_tensor; + out_tensor.SetTensorDesc(out_desc); + out_tensor.SetData(reinterpret_cast(in_tensor.raw_data()), + in_tensor.memory_size()); + return out_tensor; +} + +int CvtActMode(std::string act_type) { + int act_mode = 1; + if (act_type == "sigmoid") { + act_mode = 0; + } else if (act_type == "relu") { + act_mode = 1; + } else if (act_type == "tanh") { + act_mode = 2; + } else if (act_type == "relu_clipped" || act_type == "relu6") { 
+ act_mode = 3; + } else if (act_type == "elu") { + act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; + } else if (act_type == "abs") { + act_mode = 6; + } else if (act_type == "softsign") { + act_mode = 8; + } else if (act_type == "softplus") { + act_mode = 9; + } else if (act_type == "hard_sigmoid") { + act_mode = 10; + } else if (act_type == "thresholded_relu") { + act_mode = 11; + } else { + // TODO(liqi27) support more activation mode + LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Unsupported activation type " + << act_type; + } + return act_mode; +} + +const std::string& CvtFormat(ge::Format format) { + static const int MAX_FORMAT_LENGTH = 25; + static const std::string format2string[] = { + "FORMAT_NCHW = 0", + "FORMAT_NHWC = 1", + "FORMAT_ND = 2", + "FORMAT_NC1HWC0 = 3", + "FORMAT_FRACTAL_Z = 4", + "FORMAT_NC1C0HWPAD = 5", + "FORMAT_NHWC1C0 = 6", + "FORMAT_FSR_NCHW = 7", + "FORMAT_FRACTAL_DECONV = 8", + "FORMAT_C1HWNC0 = 9", + "FORMAT_FRACTAL_DECONV_TRANSPOSE = 10", + "FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11", + "FORMAT_NC1HWC0_C04 = 12", + "FORMAT_FRACTAL_Z_C04 = 13", + "FORMAT_CHWN = 14", + "FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15", + "FORMAT_HWCN = 16", + "FORMAT_NC1KHKWHWC0 = 17", + "FORMAT_BN_WEIGHT = 18", + "FORMAT_FILTER_HWCK = 19", + "FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20", + "FORMAT_HASHTABLE_LOOKUP_KEYS = 21", + "FORMAT_HASHTABLE_LOOKUP_VALUE = 22", + "FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23", + "FORMAT_HASHTABLE_LOOKUP_HITS = 24"}; + auto x = static_cast(format); + CHECK_LT(x, MAX_FORMAT_LENGTH); + return format2string[x]; +} + +const std::string& CvtDataType(ge::DataType data_type) { + static const int MAX_DATATYPE_LENGTH = 14; + static const std::string datatype2string[] = {"DT_FLOAT=0", + "DT_FLOAT16=1", + "DT_INT8=2", + "DT_INT32=3", + "DT_UINT8=4", + "Unknown=5", + "DT_INT16=6", + "DT_UINT16=7", + "DT_UINT32=8", + "DT_INT64=9", + "DT_UINT64=10", + "DT_DOUBLE=11", + "DT_BOOL=12", + "DT_STRING=13"}; + + auto x = static_cast(data_type); + CHECK_LT(x, MAX_DATATYPE_LENGTH); + return datatype2string[x]; +} + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/bridges/utility.h b/lite/kernels/huawei_ascend_npu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..da9a8999ad09e545745f30e02ca62c60e6f9bf82 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/bridges/utility.h @@ -0,0 +1,59 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
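// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): quick worked
// examples for the conversion helpers in utility.cc above. CvtShape left-pads
// a shape with 1s up to rank 4 (NCHW) and CvtActMode maps Paddle activation
// names to the integer modes used by the Ascend activation operator. The
// element type of CvtShape's vectors is elided in this hunk, so int64_t is
// assumed here, and UtilityConversionExamples is a hypothetical function.
#include <cassert>
#include <cstdint>
#include <vector>

static void UtilityConversionExamples() {
  // {32, 10} is padded on the left with 1s to the 4-D NCHW form.
  std::vector<int64_t> nchw = CvtShape(std::vector<int64_t>{32, 10});
  assert((nchw == std::vector<int64_t>{1, 1, 32, 10}));
  // Activation names map onto integer activation modes.
  assert(CvtActMode("relu") == 1);
  assert(CvtActMode("leaky_relu") == 5);
}
// ---------------------------------------------------------------------------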
+ +#pragma once + +#include +#include +#include +#include +#include +// #include "graph/buffer.h" +#include "graph/tensor.h" +#include "graph/types.h" +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace huawei_ascend_npu { + +// Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +ge::DataType CvtPrecisionType(PrecisionType itype); + +ge::Format CvtDataLayoutType(DataLayoutType itype); + +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape = {}, + DataLayoutType in_layout = DATALAYOUT(kNCHW)); + +int CvtActMode(std::string act_type); + +const std::string& CvtFormat(ge::Format format); +const std::string& CvtDataType(ge::DataType data_type); + +} // namespace huawei_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.cc b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e71c71ca28b163f27a9783572d585466335ef87 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.cc @@ -0,0 +1,483 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/huawei_ascend_npu/subgraph_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/huawei_ascend_npu/bridges/graph.h" +#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h" +#include "lite/kernels/huawei_ascend_npu/bridges/utility.h" +#include "lite/utils/io.h" +#include "lite/utils/md5.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. 
the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (size_t i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; + } + } + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} + +// Serialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op into files +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from cached file from:" + << model_path; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromFile( + model_path, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from cached file failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path; + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] read from " << config_path + << " failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading configuration success:" + << config_path; + std::string config_str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(config_str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; +} + +bool DeviceProgram::BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + // then added them into the IR graph + int status = 0; + subgraph::huawei_ascend_npu::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + CHECK(origin_program) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin 
program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kHuaweiAscendNPU))) { + return false; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kHuaweiAscendNPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return false; + } + } + // Collect the input and output nodes of the IR graph + std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } + std::vector device_onodes; + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); + } + // Build the IR graph to the om model + std::vector model_buffer; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Building model from model buffer..."; + if (!lite::huawei_ascend_npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Build model failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Build model success."; + // Load the om model and create a model manager client + VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model from memory ..."; + model_client_ = lite::huawei_ascend_npu::Device::Global().LoadFromMem( + model_buffer, device_id); + if (!model_client_) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!"; + return false; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Load model from memory success."; + // Update the precison and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); + } + if (!model_cache_dir.empty()) { + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << model_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved OM model success:"; + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (size_t i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saving configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Open " << config_path + << " for writting failed!"; + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] Saved configuration file success."; + } + return true; +} + +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + 
std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + VLOG(3) << "[HUAWEI_ASCEND_NPU] Sharing buffer with origin tnsors..."; + if (device_idims_.empty() || device_odims_.empty()) { + if (!(model_client_->GetModelIOTensorDim(&device_idims_, &device_odims_))) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] Get the dimensions of input and output " + "tensors failed!"; + return false; + } + } + VLOG(3) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim success."; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), output_names.size()); + for (size_t i = 0; i < input_names.size(); i++) { + VLOG(3) << "[HUAWEI_ASCEND_NPU] Inputs[" << i + << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + + // reset tensor desc + if ((*device_itensors)[i]->SetTensorDesc( + device_idims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetTensorDesc " + "success."; + } + // copy data from origin to device + if ((*device_itensors)[i]->SetData( + reinterpret_cast((*origin_itensors)[i]->raw_data()), + (*origin_itensors)[i]->memory_size()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) + << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor input tensor SetData success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the input tensors for the device program " + "and share their buffers with the origin input tensors"; + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_itensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); + } + for (size_t i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Outputs[" << i + << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // reset tensor desc + if ((*device_otensors)[i]->SetTensorDesc( + device_odims_[i].GetGeTensorDesc()) != ge::GRAPH_SUCCESS) { + LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Tensor 
output tensor " + "SetTensorDesc failed!"; + } else { + VLOG(3) << "[HUAWEI_ASCEND_NPU] ge::Tensor output tensor SetTensorDesc " + "success."; + } + VLOG(3) + << "[HUAWEI_ASCEND_NPU] Init the output tensors for the device program " + "and share their buffers with the origin output tensors"; + } + return true; +} + +bool DeviceProgram::SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + + for (size_t i = 0; i < output_names.size(); i++) { + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = std::make_shared( + reinterpret_cast((*device_otensors)[i]->GetData()), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); + } + // unload model after model execution + CHECK_EQ(model_client_->UnloadModel(), true); + return true; +} + +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + // int istamp; + auto start_time = GetCurrentUS(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Starting ZeroCopyRun to ModelExecute ..."; + CHECK_EQ(model_client_->ModelExecute(device_itensors, device_otensors), true); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Process cost " << GetCurrentUS() - start_time + << " us"; + return true; +} + +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (size_t i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new ge::Tensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (size_t i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new ge::Tensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(); + auto device_id = ctx_->As().HuaweiAscendDeviceID(); + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get model cached dir: " << model_cache_dir; + VLOG(3) << "[HUAWEI_ASCEND_NPU] Get huawei ascend npu device id: " + << device_id; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile(input_names_, + output_names_, + origin_idims_, + model_cache_dir, + device_id)) { + // Build the 
model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. + if (!origin_program_) { + BuildOriginProgram(); + } + CHECK(origin_program_) + << "[HUAWEI_ASCEND_NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[HUAWEI_ASCEND_NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir, + device_id)) { + return false; + } + } + if (device_program->model_client_ == nullptr) { + return false; + } + device_programs_[origin_idims_] = device_program; + } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); +} + +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. + if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); + } + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); + } + if (!device_program->ZeroCopyRun(&device_itensors_, &device_otensors_)) { + return false; + } + if (!device_program->SharedBufferWithOutputTensors( + output_names_, &origin_otensors_, &device_otensors_)) { + return false; + } + return true; +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.block_idx, + param.program_desc, + param.exec_scope, + param.input_data_names, + param.output_data_names)); + CHECK(engine_); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Run(); +} + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kHuaweiAscendNPU, + kAny, + kNCHW, + paddle::lite::kernels::huawei_ascend_npu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/huawei_ascend_npu/subgraph_compute.h b/lite/kernels/huawei_ascend_npu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..fb7d2efe0c29912a07f11a544c91432d69c51fa0 --- /dev/null +++ b/lite/kernels/huawei_ascend_npu/subgraph_compute.h @@ -0,0 +1,121 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
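SubgraphEngine::BuildDeviceProgram above keeps one DeviceProgram per distinct set of input shapes: shapes that were already seen reuse the compiled OM model, while new shapes trigger a fresh LoadFromCacheFile/BuildGraphAndCacheToFile pass. A simplified sketch of that lookup-or-build pattern (names and signatures abbreviated, not the literal implementation):

    // Sketch: cache one DeviceProgram per input-dimension signature.
    std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<DeviceProgram>>
        device_programs;

    std::shared_ptr<DeviceProgram> GetOrBuild(
        const std::vector<std::vector<int64_t>>& origin_idims) {
      auto it = device_programs.find(origin_idims);
      if (it != device_programs.end()) return it->second;  // shapes seen before
      auto program = std::make_shared<DeviceProgram>();
      // Try the on-disk cache first, then fall back to building the graph online:
      //   program->LoadFromCacheFile(...) || program->BuildGraphAndCacheToFile(...)
      device_programs[origin_idims] = program;
      return program;
    }
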
+ +#pragma once + +#include +#include +#include +#include +#include "graph/tensor.h" +#include "lite/backends/huawei_ascend_npu/device.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace huawei_ascend_npu { + +using TensorDesc = paddle::lite::huawei_ascend_npu::TensorDesc; +using AclModelClient = paddle::lite::huawei_ascend_npu::AclModelClient; + +class DeviceProgram { + public: + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir, + const int device_id); + bool BuildGraphAndCacheToFile( + RuntimeProgram* origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir, + const int device_id); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool SharedBufferWithOutputTensors( + const std::vector& output_names, + std::vector* origin_otensors, + std::vector>* device_otensors); + bool ZeroCopyRun(std::vector>* device_itensors, + std::vector>* device_otensors); + + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + const std::shared_ptr& program_desc, + Scope* exec_scope, + const std::vector& input_names, + const std::vector& output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} + + protected: + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; + + private: + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + void PrepareForRun() override; + void Run() override; + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace huawei_ascend_npu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 044827dbf98c561b0d424a1c93b0da650ef58796..75570a6249ecaa36a94b73dafb27f655495cab87 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -43,13 +43,17 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc* block_desc, + const std::shared_ptr& program_desc, + Scope* exec_scope, const std::vector& input_names, const std::vector& output_names, - Scope* scope, paddle::lite_api::PrecisionType type) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope), + : subgraph::Engine(ctx, + block_idx, + 
program_desc, + exec_scope, + input_names, + output_names), fp_type_(type) { VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is " << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL"); @@ -103,7 +107,7 @@ class SubgraphEngine : public subgraph::Engine { protected: bool BuildDeviceProgram() override { - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } if (!error_compile_batch_size_changeable_ && @@ -128,13 +132,15 @@ class SubgraphEngine : public subgraph::Engine { origin_itensors_.clear(); origin_otensors_.clear(); - auto data_order = block_desc_->GetOp(0)->Type() == "layout" + auto* sub_block_desc = + program_desc_->GetBlock()(block_idx_); + auto data_order = sub_block_desc->GetOp(0)->Type() == "layout" ? CNML_NCHW : CNML_NHWC; // Convert all of input data vars and added into the MLU IR graph status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; for (auto& input_name : input_names_) { - auto input_tensor = scope_->FindMutableTensor(input_name); + auto input_tensor = exec_scope_->FindMutableTensor(input_name); auto data_type = input_tensor->precision(); cnmlDataType_t fp_type = PrecisionToDatatype(data_type); origin_itensors_.push_back(input_tensor); @@ -161,7 +167,8 @@ class SubgraphEngine : public subgraph::Engine { LOG(INFO) << "START TO CONVERT "; // Convert all of ops and its weights and added into the MLU IR graph const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = inst.op(); CHECK(op); std::string op_type = op->op_info()->Type(); @@ -200,7 +207,7 @@ class SubgraphEngine : public subgraph::Engine { for (auto& output_name : output_names_) { if (graph->HasNode(output_name)) { graph->AddOutput(graph->GetNode(output_name)); - auto output_tensor = scope_->FindMutableTensor(output_name); + auto output_tensor = exec_scope_->FindMutableTensor(output_name); origin_otensors_.push_back(output_tensor); VLOG(4) << "subgraph output tensor " << output_name << std::endl; @@ -257,7 +264,7 @@ class SubgraphEngine : public subgraph::Engine { for (const auto& input_name : input_names_) { tmp = input_name; name += TrimStrings(tmp) + delimiter + input_shape_str; - auto input_tensor = scope_->FindMutableTensor(input_name); + auto input_tensor = exec_scope_->FindMutableTensor(input_name); for (const auto& iterm : input_tensor->dims().Vectorize()) { name += std::to_string(iterm) + delimiter_num; } @@ -266,7 +273,7 @@ class SubgraphEngine : public subgraph::Engine { for (const auto& output_name : output_names_) { tmp = output_name; name += TrimStrings(tmp) + delimiter + output_shape_str; - auto output_tensor = scope_->FindMutableTensor(output_name); + auto output_tensor = exec_scope_->FindMutableTensor(output_name); for (const auto& iterm : output_tensor->dims().Vectorize()) { name += std::to_string(iterm) + delimiter_num; } @@ -284,7 +291,8 @@ class SubgraphEngine : public subgraph::Engine { origin_otensors_[i]->Resize(iter->second[i]); } } else { - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = inst.op(); CHECK(op); op->CheckShape(); @@ -475,11 +483,11 @@ class SubgraphCompute auto& param = this->template Param(); // LOG(INFO) << "SUBGRAP Prepare RUN index " << param.sub_block_idx; engine_.reset(new SubgraphEngine(this->ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + 
param.exec_scope, param.input_data_names, param.output_data_names, - param.scope, this->precision())); CHECK(engine_); } diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 5157f47867160cf4f705306ca37cfad962373386..be30d1c03988cb8b88761c0719c2785446c0b0ea 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU AND NOT LITE_WITH_HUAWEI_ASCEND_NPU) return() endif() diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 884ab1acce8f0927def660ae35941d85b4c85901..b9f81a74ad997966ecb79c66bceed1e84b4a91f7 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -25,11 +25,14 @@ namespace subgraph { Engine::Engine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - lite::Scope *scope) - : ctx_(ctx), block_idx_(block_idx), block_desc_(block_desc), scope_(scope) { + const std::vector &output_names) + : ctx_(ctx), + block_idx_(block_idx), + program_desc_(program_desc), + exec_scope_(exec_scope) { input_names_ = input_names; output_names_ = output_names; // Sort the name of input and output tensors, it's convenient for us to get @@ -55,12 +58,12 @@ bool Engine::PrepareWorkspaceForOriginProgram() { origin_idims_.resize(input_names_.size()); origin_itensors_.resize(input_names_.size()); for (int i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + origin_itensors_[i] = exec_scope_->FindMutableTensor(input_names_[i]); CHECK(origin_itensors_[i]); } origin_otensors_.resize(output_names_.size()); for (int i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + origin_otensors_[i] = exec_scope_->FindMutableTensor(output_names_[i]); CHECK(origin_otensors_[i]); } return true; @@ -69,70 +72,20 @@ bool Engine::PrepareWorkspaceForOriginProgram() { bool Engine::BuildOriginProgram() { // TODO(hong19860320) The block_desc need to be divided into subgraphs during // the exection time. But only see them as a subgraph now. 
- origin_program_.clear(); - for (size_t op_idx = 0; op_idx < block_desc_->OpsSize(); op_idx++) { - auto op_desc = block_desc_->GetOp(op_idx); - CHECK(op_desc); - std::string op_type = op_desc->Type(); - // Create op and pick up the best kernel - auto op = LiteOpRegistry::Global().Create(op_desc->Type()); - CHECK(op) << "no Op found for " << op_type; - op->Attach(*op_desc, scope_); - std::unique_ptr picked_kernel; - if (op_desc->HasAttr(kKernelTypeAttr)) { - // Create op and pick up the best kernel according to the - // kKernelTypeAttr attribute - auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); - std::string alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - VLOG(3) << "Found the attr '" << kKernelTypeAttr << "': " << kernel_type - << " for " << op_type; - auto kernels = op->CreateKernels({place}); - CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; - auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr &it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - picked_kernel = std::move(*it); - } else { - // TODO(hong19860320) add kernel picking according to the type of input - // and output tensors - VLOG(3) << "The attr '" << kKernelTypeAttr - << "' not found, pick the first kernel for " << op_type; - std::vector> kernels; -#if defined(LITE_WITH_ARM) - kernels = op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); -#elif defined(LITE_WITH_X86) - kernels = op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); -#endif - if (kernels.size() > 0) { - picked_kernel = std::move(kernels.front()); - } else { - LOG(WARNING) << "No kernels found for " << op_type; - } - } - if (picked_kernel != nullptr) { - picked_kernel->SetContext( - ContextScheduler::Global().NewContext(picked_kernel->target())); - } - origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); + if (!origin_program_) { + origin_program_.reset( + new RuntimeProgram(program_desc_, exec_scope_, block_idx_)); } - CHECK(!origin_program_.empty()) << "no instructions"; return true; } bool Engine::LaunchOriginProgram() { - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } - if (!origin_program_.empty()) { - for (auto &inst : origin_program_) { - auto op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - inst.Run(); - } + if (origin_program_) { + VLOG(3) << "Roll back to run the origin program."; + origin_program_->Run(); return true; } return false; diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index b49b8fea5a6d39610ea7398e177e7d1ec5a35f92..daa02fb0d7bf8f70ebf8b21821a274b6a0ba062d 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -30,10 +30,10 @@ class Engine { public: Engine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - lite::Scope *scope); + const std::vector &output_names); virtual ~Engine() = default; virtual bool Run(); @@ -54,15 +54,15 @@ class Engine { KernelContext *ctx_{nullptr}; int block_idx_{-1}; - cpp::BlockDesc *block_desc_{nullptr}; + const std::shared_ptr program_desc_{nullptr}; std::vector input_names_; std::vector output_names_; - Scope *scope_{nullptr}; + Scope *exec_scope_{nullptr}; bool is_first_epoch_{true}; std::vector> origin_idims_; std::vector origin_itensors_; std::vector 
origin_otensors_; - std::vector origin_program_; + std::unique_ptr origin_program_{nullptr}; }; } // namespace subgraph diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 6afb445e0ed411251d203bcb0420b0fba8ab6beb..e9c5957ff6d8f026f712de04f4e32cd69baf50a9 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -55,7 +55,8 @@ std::string DeviceProgram::GenerateModelName( } // Deserialize the generated model, the precisions and dimensions of the origin -// output tensors of the subgraph op into files +// output tensors of the subgraph op from the cached configuration file and HiAI +// om file bool DeviceProgram::LoadFromCacheFile( const std::vector& input_names, const std::vector& output_names, @@ -71,7 +72,7 @@ bool DeviceProgram::LoadFromCacheFile( VLOG(3) << "[NPU] Load model from " << model_path; std::vector model_buffer; if (!ReadFile(model_path, &model_buffer)) { - LOG(WARNING) << "[NPU] read from " << model_path << " failed!"; + LOG(WARNING) << "[NPU] Open " << model_path << " for reading failed!"; return false; } bool model_comp = false; @@ -98,9 +99,9 @@ bool DeviceProgram::LoadFromCacheFile( LOG(WARNING) << "[NPU] read from " << config_path << " failed!"; return false; } - std::string config_str(config_buffer.begin(), config_buffer.end()); + std::string str(config_buffer.begin(), config_buffer.end()); // Parse the precision and shapes of the output tensors - auto output_options = Split(config_str, ";"); + auto output_options = Split(str, ";"); CHECK_EQ(output_options.size(), output_names.size()); origin_otypes_.resize(output_names.size()); origin_odims_.resize(output_names.size()); @@ -114,7 +115,7 @@ bool DeviceProgram::LoadFromCacheFile( } bool DeviceProgram::BuildGraphAndCacheToFile( - const std::vector& origin_program, + RuntimeProgram* origin_program, const std::vector& input_names, const std::vector& output_names, const std::vector>& origin_idims, @@ -127,10 +128,13 @@ bool DeviceProgram::BuildGraphAndCacheToFile( // Convert all of ops and their input vars and weights to HiAI IR nodes, // then added them into the HiAI IR graph int status = 0; - CHECK(!origin_program.empty()) << "no instructions"; subgraph::npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program) { + CHECK(origin_program) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program->instructions(kRootBlockIdx).size(), 0) + << "[NPU] No instructions found in the origin program!"; + const auto& insts = origin_program->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -149,7 +153,8 @@ bool DeviceProgram::BuildGraphAndCacheToFile( // Collect the input and output nodes of the HiAI IR graph std::vector device_inodes; for (size_t i = 0; i < input_names.size(); i++) { - CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data()); + CHECK(graph.Has(input_names[i])); + CHECK(graph.Get(input_names[i])->is_data()); device_inodes.push_back(*graph.Get(input_names[i])->data()); } std::vector device_onodes; @@ -173,6 +178,9 @@ bool DeviceProgram::BuildGraphAndCacheToFile( LOG(WARNING) << "[NPU] Load model failed!"; return false; } + // Do not check model compatibility because it assume that the cached om model + // is always compatible with the current device + // Update the precison and dimensions of the origin output tensors // Update the precison and dimensions of the origin output tensors 
CHECK_EQ(origin_otensors.size(), output_names.size()); origin_otypes_.resize(output_names.size()); @@ -247,7 +255,7 @@ bool DeviceProgram::ShareBufferWithOriginTensors( device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); VLOG(3) << "[NPU] Init the input tensors for the device program and share " "their buffers with the origin input tensors"; - // reinit device tensor will free shared buffer, so copy data to a tmp + // Reinit device tensor will free shared buffer, so copy data to a tmp // tensor Tensor tmp; tmp.CopyDataFrom(*(*origin_itensors)[i]); @@ -337,8 +345,9 @@ bool SubgraphEngine::BuildDeviceProgram() { if (!device_programs_.count(origin_idims_)) { auto device_program = std::make_shared(); // Obtain the model cache dir from the NPU Context of the subgraph op - auto model_cache_dir = ctx_->As().SubgraphModelCacheDir(); - VLOG(3) << "[NPU] Getting subgraph model_cache_dir is: " << model_cache_dir; + auto model_cache_dir = + ctx_->As().SubgraphModelCacheDir(exec_scope_); + VLOG(3) << "[NPU] Getting subgraph_model_cache_dir: " << model_cache_dir; // Check and load if the cached model and configuration file exists if (model_cache_dir.empty() || !device_program->LoadFromCacheFile( @@ -346,11 +355,13 @@ bool SubgraphEngine::BuildDeviceProgram() { // Build the model online, including converting the paddle ops to the HiAI // IR nodes, building the HiAI IR graph to the om model, then load it as a // new HiAI model manager client for inference. - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } - CHECK(!origin_program_.empty()) << "no instructions"; - if (!device_program->BuildGraphAndCacheToFile(origin_program_, + CHECK(origin_program_) << "[NPU] The origin program is not initialized!"; + CHECK_GT(origin_program_->instructions().size(), 0) + << "[NPU] No instructions found in the origin program!"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_.get(), input_names_, output_names_, origin_idims_, @@ -391,11 +402,11 @@ bool SubgraphEngine::LaunchDeviceProgram() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); } diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 33321a7789fbc1eee5ff759dcf682d8e875ffe96..2203acaee82704b2a9e93d8b14d708197d7afb1a 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -41,7 +41,7 @@ class DeviceProgram { const std::vector>& origin_idims, const std::string& model_cache_dir); bool BuildGraphAndCacheToFile( - const std::vector& origin_program, + RuntimeProgram* origin_program, const std::vector& input_names, const std::vector& output_names, const std::vector>& origin_idims, @@ -71,12 +71,16 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext* ctx, int block_idx, - cpp::BlockDesc* block_desc, + const std::shared_ptr& program_desc, + Scope* exec_scope, const std::vector& input_names, - const std::vector& output_names, - Scope* scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector& output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: bool PrepareWorkspaceForDeviceProgram() override; 
diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 4eab7be1f1ac6459250c6df984160f0f6060ea1c..e61557a71dfbf1353decc9491b67c5e1e326512e 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -152,7 +152,7 @@ class ConvImageCompute : public KernelLite(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_tune_{false}; + bool use_tune_{true}; }; } // namespace opencl diff --git a/lite/kernels/opencl/nearest_interp_image_compute_test.cc b/lite/kernels/opencl/nearest_interp_image_compute_test.cc index 4a9948832d1a96d95a7f317bd3ac8245292ae02b..fb40da290d10ed49f293cf7ff78865f2e7967eab 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute_test.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute_test.cc @@ -155,6 +155,7 @@ TEST(nearest_interp_image2d, compute) { auto *x_data = x.mutable_data(TARGET(kOpenCL)); auto *y_data = y.mutable_data(TARGET(kOpenCL)); auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); + memset(reinterpret_cast(y_data_ref), 0, y_ref.numel()); auto *mapped_x = static_cast(TargetWrapperCL::Map( x_data, 0, sizeof(float) * x_dim.production())); auto *mapped_y = static_cast(TargetWrapperCL::Map( diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc index a50505c38c0740f762256cd71e006caf9249838e..da01539b291d57da1501f8c3790acae8496581f3 100644 --- a/lite/kernels/rknpu/subgraph_compute.cc +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -28,26 +28,6 @@ namespace lite { namespace kernels { namespace rknpu { -bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { - // Obtain the origin input tensors, and create the origin output - // tensors(Don't try to access them before launch the device program or the - // origin program) - PrepareWorkspaceForOriginProgram(); - // Create the device input and output tensors, but don't initialize them - // with the dimensions - device_itensors_.resize(input_names_.size()); - for (int i = 0; i < input_names_.size(); i++) { - device_itensors_[i].reset(new hiai::AiTensor); - CHECK(device_itensors_[i]); - } - device_otensors_.resize(output_names_.size()); - for (int i = 0; i < output_names_.size(); i++) { - device_otensors_[i].reset(new hiai::AiTensor); - CHECK(device_otensors_[i]); - } - return true; -} - bool SubgraphEngine::BuildDeviceProgram() { LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; int status = 0; @@ -55,10 +35,11 @@ bool SubgraphEngine::BuildDeviceProgram() { // RKNPU IR graph subgraph::rknpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -76,92 +57,26 @@ bool SubgraphEngine::BuildDeviceProgram() { } // Collect the valid input and output nodes in the RKNPU IR graph and update // the input and output names - device_inames_.clear(); - device_onames_.clear(); - - for (auto& input_name : input_names_) { - LOG(INFO) << "[RKNPU] Input node " << input_name; - if (graph.Has(input_name)) { - LOG(INFO) << input_name << " Precision " - << PrecisionToStr(graph.Get(input_name)->precision()); - device_itensors_.push_back(graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[RKNPU] Input node " << input_name - << " is ignored because it does 
not exist."; - } - } - - for (auto& output_name : output_names_) { - LOG(INFO) << "[RKNPU] Output node " << output_name; - if (graph.Has(output_name)) { - auto tensor = scope_->FindMutableTensor(output_name); - LOG(INFO) << output_name << " Precision " - << PrecisionToStr(tensor->precision()); - device_otensors_.push_back(graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[RKNPU] Output node " << output_name - << " is ignored because it does not exist."; - } - } - CHECK(!device_inames_.empty()) - << "[RKNPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[RKNPU] No output nodes found for building NPU model"; - - device_program_ = lite::rknpu::Device::Global().Build( - model_name_, graph.GetHandle(), device_itensors_, device_otensors_); - if (device_program_ == nullptr) { - LOG(WARNING) << "[RKNPU] Build model failed!"; - return false; - } - - // input - origin_idims_.resize(input_names_.size()); - origin_itensors_.resize(input_names_.size()); + device_itensors_.clear(); + device_otensors_.clear(); for (size_t i = 0; i < input_names_.size(); i++) { - origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - } - // output - origin_odims_.resize(output_names_.size()); - origin_otensors_.resize(output_names_.size()); - for (size_t i = 0; i < output_names_.size(); i++) { - origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - - auto output_dims = origin_otensors_[i]->dims(); - } - - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); + CHECK(graph.Has(input_names_[i])) << "[RKNPU] Failed to find input node " + << input_names_[i]; + auto node = graph.Get(input_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - - LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << device_inames_[i] + LOG(INFO) << "[RKNPU] Inputs[" << i << "] name: " << input_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); + device_itensors_.push_back(node->data()); } - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); + for (size_t i = 0; i < output_names_.size(); i++) { + CHECK(graph.Has(output_names_[i])) << "[RKNPU] Failed to find output node " + << output_names_[i]; + auto node = graph.Get(output_names_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << device_onames_[i] + LOG(INFO) << "[RKNPU] Outputs[" << i << "] name: " << output_names_[i] << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout); // Prepare the device output tensors @@ -182,11 +97,19 @@ bool SubgraphEngine::BuildDeviceProgram() { 
origin_otensors_[i]->mutable_data(); break; default: - LOG(FATAL) << "[RKNPU] " << device_onames_[i] + LOG(FATAL) << "[RKNPU] " << output_names_[i] << " can't mutable data with precision type " << PrecisionToStr(precision); break; } + device_otensors_.push_back(node->data()); + } + // Create the RKNPU model and set the input and output nodes + device_program_ = lite::rknpu::Device::Global().Build( + model_name_, graph.GetHandle(), device_itensors_, device_otensors_); + if (device_program_ == nullptr) { + LOG(WARNING) << "[RKNPU] Build model failed!"; + return false; } return true; } @@ -196,8 +119,8 @@ bool SubgraphEngine::LaunchDeviceProgram() { std::vector inputs; std::vector outputs; - inputs.resize(device_itensors_.size()); - for (size_t i = 0; i < device_itensors_.size(); i++) { + inputs.resize(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); i++) { inputs[i].index = i; inputs[i].buf = const_cast(origin_itensors_[i]->raw_data()); inputs[i].size = origin_itensors_[i]->memory_size(); @@ -207,8 +130,8 @@ bool SubgraphEngine::LaunchDeviceProgram() { inputs[i].layout = rk::nn::DataLayoutType::NCHW; } - outputs.resize(device_otensors_.size()); - for (size_t i = 0; i < device_otensors_.size(); i++) { + outputs.resize(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); i++) { outputs[i].index = i; outputs[i].buf = const_cast(origin_otensors_[i]->raw_data()); outputs[i].size = origin_otensors_[i]->memory_size(); @@ -225,11 +148,11 @@ void SubgraphCompute::PrepareForRun() { LOG(INFO) << "[RKNPU]:PrepareForRun"; auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); } diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h index a4bdadc658a81decd8107072f7b5948613d0c68a..78162b3d165bde8e33436654bbcd1110ad9afea6 100644 --- a/lite/kernels/rknpu/subgraph_compute.h +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -34,15 +34,18 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: - bool PrepareWorkspaceForDeviceProgram() override; bool BuildDeviceProgram() override; bool LaunchDeviceProgram() override; diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc index 376cdd0dc23426ede42ddac60e061727f73322e3..224bfdc130338bc653091400708bc8a7421a9482 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.cc @@ -31,11 +31,14 @@ void XPUEmbeddingWithEltwiseAddCompute::PrepareForRun() { CHECK_EQ(table_dims.size(), 2); /* shape like [table_len, embed_dim] */ table_lens_cpu_.push_back(table_dims[0]); } - void* lens_ptr = nullptr; + size_t lens_size = table_lens_cpu_.size() * sizeof(int); - xpu_malloc(&lens_ptr, lens_size); - xpu_memcpy(lens_ptr, &table_lens_cpu_[0], 
lens_size, XPU_HOST_TO_DEVICE); - table_lens_guard_.reset(lens_ptr); + table_lens_guard_ = + TargetWrapperXPU::MallocScratchPad(lens_size, false /* use_l3 */); + XPU_CALL(xpu_memcpy(table_lens_guard_->addr_, + &table_lens_cpu_[0], + lens_size, + XPU_HOST_TO_DEVICE)); } void XPUEmbeddingWithEltwiseAddCompute::Run() { @@ -55,16 +58,16 @@ void XPUEmbeddingWithEltwiseAddCompute::Run() { int embed_dim = table_dims[1]; int emb_layer_num = param.Ids.size(); int r = xdnn::embedding_with_ewadd( - ctx.GetRawContext(), /* context */ - embed_dim, /* embed_dim */ - idx_len, /* idx_len */ - emb_layer_num, /* emb_layer_num */ - param.padding_idx, /* padding_idx */ - &arg_tables_[0], /* tables */ - &arg_ids_[0], /* indices */ - static_cast(table_lens_guard_.get()), /* table_lens */ - nullptr, /* scale_after_emb */ - nullptr, /* scale_after_ewadd */ + ctx.GetRawContext(), /* context */ + embed_dim, /* embed_dim */ + idx_len, /* idx_len */ + emb_layer_num, /* emb_layer_num */ + param.padding_idx, /* padding_idx */ + &arg_tables_[0], /* tables */ + &arg_ids_[0], /* indices */ + static_cast(table_lens_guard_->addr_), /* table_lens */ + nullptr, /* scale_after_emb */ + nullptr, /* scale_after_ewadd */ param.Out->mutable_data(TARGET(kXPU)) /* top */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h index 10ba6e0b5b76a1dbebfd633732f7c36e6ac7c954..124ed7866f0a52b892e30ae41398d5140064c964 100644 --- a/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h +++ b/lite/kernels/xpu/__xpu__embedding_with_eltwise_add_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -36,7 +35,7 @@ class XPUEmbeddingWithEltwiseAddCompute private: std::vector arg_ids_; std::vector arg_tables_; - std::unique_ptr table_lens_guard_; + XPUScratchPadGuard table_lens_guard_; std::vector table_lens_cpu_; }; diff --git a/lite/kernels/xpu/__xpu__mmdnn_compute.cc b/lite/kernels/xpu/__xpu__mmdnn_compute.cc index 39ddecb1139073cb1a0bd8e3c7afc89f1d739da8..09d59fcee37c634a87636ac80e7be15d927f2509 100644 --- a/lite/kernels/xpu/__xpu__mmdnn_compute.cc +++ b/lite/kernels/xpu/__xpu__mmdnn_compute.cc @@ -27,8 +27,8 @@ namespace { void FillMax(float max, float* xpu_ptr) { float maxs[4] = {max, 0.0f, 0.0f, 0.0f}; - xpu_memcpy( - xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy( + xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } void GrnnLayout(int batch, @@ -156,8 +156,8 @@ class MMDNNIdInfo { idx_sorted.data(), idx_sorted.size() * sizeof(int)); offset += idx_sorted.size() * sizeof(int); - xpu_memcpy( - l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy( + l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } }; @@ -221,29 +221,32 @@ class MMDNNFcOp { int m, float* out, const float* in_max_by_caller = nullptr) { + int r = 0; if (in_max_by_caller == nullptr) { - xdnn::findmax(ctx, in, m * k_, in_max_); + r = xdnn::findmax(ctx, in, m * k_, in_max_); + CHECK_EQ(r, 0); in_max_by_caller = in_max_; } - xdnn::gemm_int16_maxptr(ctx, - false, - true, - m, - n_, - k_, - 1.0f, - in, - k_, - weight_, - k_, - 0.0f, - out, - n_, - bias_, - act_type_, - in_max_by_caller, - weight_max_, - out_max); + r = xdnn::gemm_int16_maxptr(ctx, + false, 
+ true, + m, + n_, + k_, + 1.0f, + in, + k_, + weight_, + k_, + 0.0f, + out, + n_, + bias_, + act_type_, + in_max_by_caller, + weight_max_, + out_max); + CHECK_EQ(r, 0); } }; @@ -331,44 +334,49 @@ class MMDNNGrnnOp { gru_out = l3_buffer + 4 * slot_size; } - xdnn::search_seq2batch(ctx, - batch, - max_width, - cap_e_, - sentense.idx_sorted_32, - sentense.lod_32, - sentense.new_offset_32, - in, - seq2batch_out); - - xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + int r = 0; + r = xdnn::search_seq2batch(ctx, + batch, + max_width, + cap_e_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + in, + seq2batch_out); + CHECK_EQ(r, 0); + + r = xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + CHECK_EQ(r, 0); fc_e2h0_.Infer(ctx, seq2batch_out, cap_l, fc_e2h_out, input_max_); fc_e2h1_.Infer( ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_, input_max_); fc_e2h2_.Infer( ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_ * 2, input_max_); - xdnn::search_grnn(ctx, - cap_l, - cap_h_, - cap_e_, - max_width, - sentense.new_offset_32, - fc_e2h_out, - dense_h2h_, - gru_out, - dense_h2h_max_[0], - dense_h2h_max_[1], - dense_h2h_max_[2]); - - xdnn::search_batch2seq(ctx, - batch, - max_width, - cap_h_, - sentense.idx_sorted_32, - sentense.lod_32, - sentense.new_offset_32, - gru_out, - out); + r = xdnn::search_grnn(ctx, + cap_l, + cap_h_, + cap_e_, + max_width, + sentense.new_offset_32, + fc_e2h_out, + dense_h2h_, + gru_out, + dense_h2h_max_[0], + dense_h2h_max_[1], + dense_h2h_max_[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx, + batch, + max_width, + cap_h_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + gru_out, + out); + CHECK_EQ(r, 0); } }; @@ -435,38 +443,43 @@ class MMDNNAttentionOp { } seqfc_.Infer(ctx, input, cap_l, seqfc_out); - xdnn::search_noaligned_mat_mul(ctx, - 0, - 1, - batch, - lod_32, - max_width, - dim_, - alpha0_, - input, - seqfc_out, - batchgemm0_out); - xdnn::search_seq_softmax( + int r = 0; + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 1, + batch, + lod_32, + max_width, + dim_, + alpha0_, + input, + seqfc_out, + batchgemm0_out); + CHECK_EQ(r, 0); + r = xdnn::search_seq_softmax( ctx, batchgemm0_out, seq_softmax_out, lod_32, batch, max_width); - xdnn::search_noaligned_mat_mul(ctx, - 0, - 0, - batch, - lod_32, - max_width, - dim_, - alpha1_, - seq_softmax_out, - input, - batchgemm1_out); - xdnn::sequence_pooling_forward(ctx, - xdnn::Pooling_t::MAX_WITHOUT_INDEX, - batch, - lod_32, - dim_, - batchgemm1_out, - nullptr, - pool_out); + CHECK_EQ(r, 0); + r = xdnn::search_noaligned_mat_mul(ctx, + 0, + 0, + batch, + lod_32, + max_width, + dim_, + alpha1_, + seq_softmax_out, + input, + batchgemm1_out); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::MAX_WITHOUT_INDEX, + batch, + lod_32, + dim_, + batchgemm1_out, + nullptr, + pool_out); + CHECK_EQ(r, 0); } }; @@ -510,12 +523,13 @@ class MMDNNMatchConvTopk { float conv_w_max, int dim_t, int dim_in, + int out_channel, int upper_bound_batch, int upper_bound_seqlen, const std::vector& topks) { dim_t_ = dim_t; dim_in_ = dim_in; - out_channel_ = 5; // TODO(miaotianxiang): + out_channel_ = out_channel; topks_ = topks; xw_fc_.Init(input_w, @@ -553,10 +567,10 @@ class MMDNNMatchConvTopk { topks_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(topks_.size() * sizeof(int), false); topks_xpu_ = reinterpret_cast(topks_xpu_guard_->addr_); - xpu_memcpy(topks_xpu_, - topks_.data(), - topks_.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + 
XPU_CALL(xpu_memcpy(topks_xpu_, + topks_.data(), + topks_.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); useless_topk_pos_guard_ = TargetWrapperXPU::MallocScratchPad(4 * sizeof(int), false); useless_topk_pos_ = reinterpret_cast(useless_topk_pos_guard_->addr_); @@ -576,18 +590,18 @@ class MMDNNMatchConvTopk { for (auto e : left_lod) { left_lod_32_cpu.push_back(e); } - xpu_memcpy(left_lod_32_, - left_lod_32_cpu.data(), - left_lod_32_cpu.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(left_lod_32_, + left_lod_32_cpu.data(), + left_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); std::vector right_lod_32_cpu; for (auto e : right_lod) { right_lod_32_cpu.push_back(e); } - xpu_memcpy(right_lod_32_, - right_lod_32_cpu.data(), - right_lod_32_cpu.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(right_lod_32_, + right_lod_32_cpu.data(), + right_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); std::vector lod_match = {0}; std::vector lod_conv = {0}; @@ -611,18 +625,18 @@ class MMDNNMatchConvTopk { left_seqlen_sum += len_x; right_seqlen_sum += len_y; } - xpu_memcpy(match_lod_32_, - lod_match.data(), - lod_match.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(conv_lod_32_, - lod_conv.data(), - lod_conv.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(topk_offset_32_, - lod_topk.data(), - lod_topk.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(match_lod_32_, + lod_match.data(), + lod_match.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(conv_lod_32_, + lod_conv.data(), + lod_conv.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topk_offset_32_, + lod_topk.data(), + lod_topk.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); float* xwy_out = hbm_buffer_; float* conv_out = hbm_buffer_ + x_mul_y_sum * dim_t_; @@ -640,19 +654,21 @@ class MMDNNMatchConvTopk { int max_width = std::max(left_seqlen_max, right_seqlen_max); xw_fc_.Infer(ctx, left->data(), left_seqlen_sum, xw_out); - xdnn::match_matrix_tensor(ctx, - batch, - xw_out, - right->data(), - left_lod_32_, - right_lod_32_, - dim_t_, - dim_in_, - xwy_out, - xw_fc_.out_max, - xdnn::Activation_t::RELU, - max_width); - xdnn::search_varconv( + int r = 0; + r = xdnn::match_matrix_tensor(ctx, + batch, + xw_out, + right->data(), + left_lod_32_, + right_lod_32_, + dim_t_, + dim_in_, + xwy_out, + xw_fc_.out_max, + xdnn::Activation_t::RELU, + max_width); + CHECK_EQ(r, 0); + r = xdnn::search_varconv( ctx, batch, dim_t_, @@ -668,24 +684,27 @@ class MMDNNMatchConvTopk { conv_out, conv_weight_max_, xdnn::Activation_t::RELU); // TODO(miaotianxiang): - xdnn::sequence_concat(ctx, - xwy_out, - match_lod_32_, - conv_out, - conv_lod_32_, - seq_concat_out, - batch); - xdnn::sequence_topk_avg_pooling(ctx, - seq_concat_out, - seq_avg_topk_out, - useless_topk_pos_, - batch, - dim_t_ + out_channel_, - topk_offset_32_, - left_lod_32_, - right_lod_32_, - topks_xpu_, - topks_.size()); + CHECK_EQ(r, 0); + r = xdnn::sequence_concat(ctx, + xwy_out, + match_lod_32_, + conv_out, + conv_lod_32_, + seq_concat_out, + batch); + CHECK_EQ(r, 0); + r = xdnn::sequence_topk_avg_pooling(ctx, + seq_concat_out, + seq_avg_topk_out, + useless_topk_pos_, + batch, + dim_t_ + out_channel_, + topk_offset_32_, + left_lod_32_, + right_lod_32_, + topks_xpu_, + topks_.size()); + CHECK_EQ(r, 0); } }; @@ -802,34 +821,38 @@ class 
MMDNNBidEmbGrnnAtt { pool_rv = grnn_rv_pool_out->mutable_data(TARGET(kXPU)); att_out = att_pool_out->mutable_data(TARGET(kXPU)); - xdnn::search_bid_emb_ew(ctx, - batch, - sentense.lod_64, - sentense.id0_64, - sentense.id1_64, - table_, - table_len_, - emb_dim_, - emb_fw, - emb_rv, - table_len_ - 2, - 1); + int r = 0; + r = xdnn::search_bid_emb_ew(ctx, + batch, + sentense.lod_64, + sentense.id0_64, + sentense.id1_64, + table_, + table_len_, + emb_dim_, + emb_fw, + emb_rv, + table_len_ - 2, + 1); + CHECK_EQ(r, 0); bi_rv_.Infer(ctx, sentense, emb_rv, grnn_rv, l3_buffer + 2 * slot_len, l3_size - 2 * slot_len * sizeof(float)); - xdnn::sequence_reverse( + r = xdnn::sequence_reverse( ctx, batch, sentense.lod_32, cap_h_, grnn_rv, grnn_rv_rv); - xdnn::sequence_pooling_forward(ctx, - xdnn::Pooling_t::LAST, - batch, - sentense.lod_32, - cap_h_, - grnn_rv, - nullptr, - pool_rv); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); bi_fw_.Infer(ctx, sentense, @@ -837,19 +860,23 @@ class MMDNNBidEmbGrnnAtt { grnn_fw, l3_buffer + 2 * slot_len, l3_size - 2 * slot_len * sizeof(float)); - xdnn::sequence_pooling_forward(ctx, - xdnn::Pooling_t::LAST, - batch, - sentense.lod_32, - cap_h_, - grnn_fw, - nullptr, - pool_fw); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); const int concat_widths[] = {cap_h_, cap_h_, cap_h_}; const float* concat_ptrs[] = {emb_fw, grnn_fw, grnn_rv_rv}; - xdnn::concat( + r = xdnn::concat( ctx, cap_l, concat_widths + 1, 2, concat_ptrs + 1, concat_2in); - xdnn::concat(ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + CHECK_EQ(r, 0); + r = xdnn::concat( + ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + CHECK_EQ(r, 0); att_.Infer(ctx, sentense, concat_2in, @@ -899,16 +926,18 @@ class MMDNNEmbAtt { int cap_l = sentense.lod.back(); const float* emb_tables[] = {table_, table_}; const int64_t* emb_indices[] = {sentense.id0_64, sentense.id1_64}; - xdnn::embedding_with_ewadd(ctx, - emb_dim_, - cap_l, - 2, - table_len_ - 2, - emb_tables, - emb_indices, - nullptr, - nullptr, - emb_fw); + int r = + xdnn::embedding_with_ewadd(ctx, + emb_dim_, + cap_l, + 2, + table_len_ - 2, + emb_tables, + emb_indices, + nullptr, + nullptr, + emb_fw); + CHECK_EQ(r, 0); att_.Infer(ctx, sentense, emb_fw, att_out, l3_buffer, l3_size); } }; @@ -990,7 +1019,7 @@ class MMDNNMergeAll { fc2_.Init( fc2_w, fc2_w_max, fc2_b, fc2_n_, fc2_k_, xdnn::Activation_t::LINEAR); - int hbm_total_len = max_cap_l * cap_h_ * 4 + + int hbm_total_len = max_cap_l * cap_e_ * 2 + max_cap_l * cap_h_ * 2 + upper_bound_batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( @@ -1000,7 +1029,7 @@ class MMDNNMergeAll { void Infer(xdnn::Context* ctx, const MMDNNIdInfo& sentense, - const std::vector concat_2in1_x, + const std::vector concat_topk_x, const std::vector concat_7in1_x, lite::Tensor* out, float* l3_buffer = nullptr, @@ -1010,13 +1039,13 @@ class MMDNNMergeAll { float* topk_concat_out_fw = hbm_buffer_; int hbm_total_len = - cap_l * cap_h_ * 4 + + cap_l * cap_e_ * 2 + cap_l * cap_h_ * 2 + batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); if (l3_size > 0 && l3_size >= hbm_total_len * sizeof(float)) { topk_concat_out_fw = l3_buffer; } - float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_h_; - float* 
grnn_fw = topk_concat_out_rv + cap_l * cap_h_; + float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_e_; + float* grnn_fw = topk_concat_out_rv + cap_l * cap_e_; float* grnn_rv = grnn_fw + cap_l * cap_h_; float* pool_fw = grnn_rv + cap_l * cap_h_; float* pool_rv = pool_fw + batch * cap_h_; @@ -1027,18 +1056,27 @@ class MMDNNMergeAll { // float* fc2_out = fc1_out + batch * fc1_n_; float* fc2_out = out->mutable_data(TARGET(kXPU)); - const int concat_widths[] = {static_cast(concat_2in1_x[0]->dims()[1]), - static_cast(concat_2in1_x[1]->dims()[1])}; - const float* concat_ptrs[] = {concat_2in1_x[0]->data(), - concat_2in1_x[1]->data()}; - xdnn::concat( - ctx, cap_l, concat_widths, 2, concat_ptrs, topk_concat_out_fw); - xdnn::sequence_reverse(ctx, - batch, - sentense.lod_32, - cap_e_, - topk_concat_out_fw, - topk_concat_out_rv); + std::vector concat_widths; + std::vector concat_ptrs; + for (const auto* t : concat_topk_x) { + concat_widths.push_back(static_cast(t->dims()[1])); + concat_ptrs.push_back(t->data()); + } + int r = 0; + r = xdnn::concat(ctx, + cap_l, + concat_widths.data(), + concat_widths.size(), + concat_ptrs.data(), + topk_concat_out_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_reverse(ctx, + batch, + sentense.lod_32, + cap_e_, + topk_concat_out_fw, + topk_concat_out_rv); + CHECK_EQ(r, 0); coverage_fw_.Infer(ctx, sentense, topk_concat_out_fw, @@ -1051,22 +1089,24 @@ class MMDNNMergeAll { grnn_rv, l3_buffer + hbm_total_len, l3_size - hbm_total_len * sizeof(float)); - xdnn::sequence_pooling_forward(ctx, - xdnn::Pooling_t::LAST, - batch, - sentense.lod_32, - cap_h_, - grnn_fw, - nullptr, - pool_fw); - xdnn::sequence_pooling_forward(ctx, - xdnn::Pooling_t::LAST, - batch, - sentense.lod_32, - cap_h_, - grnn_rv, - nullptr, - pool_rv); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + CHECK_EQ(r, 0); + r = xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + CHECK_EQ(r, 0); const int concat_widths_fc0[] = { static_cast(concat_7in1_x[0]->dims()[1]), @@ -1089,11 +1129,13 @@ class MMDNNMergeAll { const int concat_widths_fc1[] = {cap_h_, cap_h_, fc0_n_}; const float* concat_ptrs_fc1[] = {pool_fw, pool_rv, fc0_out}; - xdnn::concat( + r = xdnn::concat( ctx, batch, concat_widths_fc0, 7, concat_ptrs_fc0, fc0_in); + CHECK_EQ(r, 0); fc0_.Infer(ctx, fc0_in, batch, fc0_out); - xdnn::concat( + r = xdnn::concat( ctx, batch, concat_widths_fc1, 3, concat_ptrs_fc1, fc1_in); + CHECK_EQ(r, 0); fc1_.Infer(ctx, fc1_in, batch, fc1_out); fc2_.Infer(ctx, fc1_out, batch, fc2_out); } @@ -1111,14 +1153,12 @@ class XPUMmdnnBidEmbGrnnAttCompute private: MMDNNIdInfo id_; MMDNNBidEmbGrnnAtt compound_; - int upper_bound_batch_ = 40; - int upper_bound_seqlen_ = 512; }; void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { auto& param = this->Param(); - id_.Init(upper_bound_batch_, upper_bound_seqlen_); + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); compound_.Init(param.emb_tbl, param.grnn_fw_wh, param.grnn_fw_wh_maxs, @@ -1131,8 +1171,8 @@ void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { param.att_fc_w, param.att_fc_w_max, param.att_fc_b, - upper_bound_batch_, - upper_bound_seqlen_); + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); } void XPUMmdnnBidEmbGrnnAttCompute::Run() { @@ -1157,6 +1197,76 @@ void XPUMmdnnBidEmbGrnnAttCompute::Run() { xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); } +class XPUMmdnnBidEmbGrnnAttCompute2 + : public 
KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam2; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNBidEmbGrnnAtt compound_; +}; + +void XPUMmdnnBidEmbGrnnAttCompute2::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); +} + +void XPUMmdnnBidEmbGrnnAttCompute2::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); + + int num = param.id0->numel(); + int embed_dim = param.emb_tbl->dims()[1]; + + // TODO(miaotianxiang): + int r = xdnn::embedding( + ctx.GetRawContext(), /* context */ + num, /* num */ + param.id0->data(), /* indices */ + embed_dim, /* embed_dim */ + param.emb_tbl->data(), /* table */ + param.emb0_out->mutable_data(TARGET(kXPU)), /* top */ + 128000 /* padding_idx */); + CHECK_EQ(r, 0); +} + class XPUMmdnnBidEmbAttCompute : public KernelLite { public: @@ -1169,20 +1279,18 @@ class XPUMmdnnBidEmbAttCompute private: MMDNNIdInfo id_; MMDNNEmbAtt compound_; - int upper_bound_batch_ = 40; - int upper_bound_seqlen_ = 512; }; void XPUMmdnnBidEmbAttCompute::PrepareForRun() { auto& param = this->Param(); - id_.Init(upper_bound_batch_, upper_bound_seqlen_); + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); compound_.Init(param.emb_tbl, param.att_fc_w, param.att_fc_w_max, param.att_fc_b, - upper_bound_batch_, - upper_bound_seqlen_); + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); } void XPUMmdnnBidEmbAttCompute::Run() { @@ -1215,8 +1323,6 @@ class XPUMmdnnMatchConvTopkCompute private: MMDNNMatchConvTopk compound_; - int upper_bound_batch_ = 40; - int upper_bound_seqlen_ = 512; }; void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { @@ -1228,8 +1334,9 @@ void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { param.conv_w_max, param.dim_t, param.input_w->dims()[0], - upper_bound_batch_, - upper_bound_seqlen_, + param.output_channel, + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN, param.topks); } @@ -1261,14 +1368,12 @@ class XPUMmdnnMergeAllCompute private: MMDNNIdInfo id_; MMDNNMergeAll compound_; - int upper_bound_batch_ = 40; - int upper_bound_seqlen_ = 512; }; void XPUMmdnnMergeAllCompute::PrepareForRun() { auto& param = this->Param(); - id_.Init(upper_bound_batch_, upper_bound_seqlen_); + id_.Init(XPU_MAX_LOD_SIZE, XPU_MAX_LOD_SEQ_LEN); compound_.Init(param.grnn_fw_wh, param.grnn_fw_wh_maxs, param.grnn_fw_wi, @@ -1286,8 +1391,8 @@ void XPUMmdnnMergeAllCompute::PrepareForRun() { param.fc2_w, param.fc2_w_max, param.fc2_b, - upper_bound_batch_, - upper_bound_seqlen_); + XPU_MAX_LOD_SIZE, + XPU_MAX_LOD_SEQ_LEN); } void XPUMmdnnMergeAllCompute::Run() { @@ -1296,10 +1401,10 @@ void XPUMmdnnMergeAllCompute::Run() { auto* xpu_ctx = ctx.GetRawContext(); - id_.Update(param.concat_2in1_x[0], param.concat_2in1_x[1]); 
+  id_.Update(param.concat_topk_x[0], param.concat_topk_x[1]);
   compound_.Infer(ctx.GetRawContext(),
                   id_,
-                  param.concat_2in1_x,
+                  param.concat_topk_x,
                   param.concat_7in1_x,
                   param.out,
                   reinterpret_cast(
@@ -1335,6 +1440,29 @@ REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att,
     .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
 
+REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att2,
+                     kXPU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute2,
+                     def)
+    .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("emb0_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_att,
                      kXPU,
                      kFloat,
@@ -1371,7 +1499,7 @@ REGISTER_LITE_KERNEL(__xpu__mmdnn_merge_all,
                      paddle::lite::kernels::xpu::XPUMmdnnMergeAllCompute,
                      def)
     .BindInput("concat_7in1_x", {LiteType::GetTensorTy(TARGET(kXPU))})
-    .BindInput("concat_2in1_x", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("concat_topk_x", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))})
diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
index 71db4e6f44f9c36e4acdaf0a440463a61f4e3099..dbc2d785d42ad29dc1cfbe36f744b71662e48315 100644
--- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h
+++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+
 #include
 #include "lite/backends/xpu/xpu_header_sitter.h"
 #include "lite/core/kernel.h"
diff --git a/lite/kernels/xpu/__xpu__resnet50_compute.h b/lite/kernels/xpu/__xpu__resnet50_compute.h
index 3d42f8b6f26edf615dba165b553b633673a4ae66..7ce8b1192ea9e85d83ddbeddc374378692866aa6 100644
--- a/lite/kernels/xpu/__xpu__resnet50_compute.h
+++ b/lite/kernels/xpu/__xpu__resnet50_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
#pragma once + #include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/kernel.h" diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.cc b/lite/kernels/xpu/__xpu__search_attention_compute.cc index 515be8935637d89d58db830f96f2ea439e7d7e68..7f02f566dfb01f2d8a57302e714f4f2cb3d4b786 100644 --- a/lite/kernels/xpu/__xpu__search_attention_compute.cc +++ b/lite/kernels/xpu/__xpu__search_attention_compute.cc @@ -22,16 +22,19 @@ namespace kernels { namespace xpu { void XPUMmdnnSearchAttentionCompute::PrepareForRun() { - offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - w_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float)); + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + w_max_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(8 * sizeof(float), false /* use_l3 */); buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad( 5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */); buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad( 5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */); - offset_cpu.reset(new int[64]); - pad_begin_cpu.reset(new int[64]); + offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + pad_begin_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } void XPUMmdnnSearchAttentionCompute::Run() { @@ -72,18 +75,18 @@ void XPUMmdnnSearchAttentionCompute::Run() { } offset_cpu[batch] = offset[batch]; - xpu_memcpy(offset_xpu_guard_->addr_, - offset_cpu.get(), - offset.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(pad_begin_xpu_guard_->addr_, - pad_begin_cpu.get(), - batch * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(w_max_xpu_guard_->addr_, - maxs_cpu, - 8 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(offset_xpu_guard_->addr_, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(pad_begin_xpu_guard_->addr_, + pad_begin_cpu.get(), + batch * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(w_max_xpu_guard_->addr_, + maxs_cpu, + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); int* pad_begin_xpu = reinterpret_cast(pad_begin_xpu_guard_->addr_); @@ -115,90 +118,99 @@ void XPUMmdnnSearchAttentionCompute::Run() { } const auto* bottom_data = X->data(); - xdnn::search_sequence_pad_depad(ctx.GetRawContext(), - const_cast(bottom_data), - group_padding_output, - offset_xpu, - max_seq, - batch, - dim1, - 0); // is_depad = 0 + int r = 0; + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + const_cast(bottom_data), + group_padding_output, + offset_xpu, + max_seq, + batch, + dim1, + 0); // is_depad = 0 + CHECK_EQ(r, 0); // do-findmax - xdnn::findmax(ctx.GetRawContext(), - group_padding_output, - batch * max_seq * dim1, - maxs_xpu); - xdnn::gemm_int16_maxptr( - ctx.GetRawContext(), - false, - true, // trans_a, trans_b - batch * max_seq, - dim1, - dim1, // m, n, k - 1.0f, - group_padding_output, - dim1, // alpha, data_a, lda - w_data, - dim1, - 0.0f, // data_b, ldb, beta - seq_fc_output, - dim1, - b_data, // data_c, ldc, bias - xdnn::Activation_t::LINEAR, - maxs_xpu, - maxs_xpu + 4, - nullptr); // max_a, max_b, max_c - 
xdnn::search_aligned_mat_mul(ctx.GetRawContext(), - 0, - 1, - batch, - max_seq, - max_seq, - dim1, - alpha0, - group_padding_output, - dim1, - seq_fc_output, - dim1, - batchgemm0_output, - max_seq); - xdnn::search_pad_mask(ctx.GetRawContext(), - batchgemm0_output, - attention_output, - pad_begin_xpu, - batch, - max_seq, - max_seq, - batch, - mask); - xdnn::softmax2d_forward(ctx.GetRawContext(), - attention_output, - seq_softmax_output, - batch * max_seq, - max_seq, - true); - xdnn::search_aligned_mat_mul(ctx.GetRawContext(), - 0, - 0, - batch, - max_seq, - dim1, - max_seq, - alpha1, - seq_softmax_output, - max_seq, - group_padding_output, - dim1, - batchgemm1_output, - dim1); - xdnn::search_sequence_pad_depad(ctx.GetRawContext(), - top_data, - batchgemm1_output, - offset_xpu, - max_seq, - batch, - dim1, - 1); // is_depad = 1 + r = xdnn::findmax(ctx.GetRawContext(), + group_padding_output, + batch * max_seq * dim1, + maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + true, /* trans_b */ + batch * max_seq, /* m */ + dim1, /* n */ + dim1, /* k */ + 1.0f, /* alpha */ + group_padding_output, /* data_a */ + dim1, /* lda */ + w_data, /* data_b */ + dim1, /* ldb */ + 0.0f, /* beta */ + seq_fc_output, /* data_c */ + dim1, /* ldc */ + b_data, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 1, + batch, + max_seq, + max_seq, + dim1, + alpha0, + group_padding_output, + dim1, + seq_fc_output, + dim1, + batchgemm0_output, + max_seq); + CHECK_EQ(r, 0); + r = xdnn::search_pad_mask(ctx.GetRawContext(), + batchgemm0_output, + attention_output, + pad_begin_xpu, + batch, + max_seq, + max_seq, + batch, + mask); + CHECK_EQ(r, 0); + r = xdnn::softmax2d_forward(ctx.GetRawContext(), + attention_output, + seq_softmax_output, + batch * max_seq, + max_seq, + true); + CHECK_EQ(r, 0); + r = xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 0, + batch, + max_seq, + dim1, + max_seq, + alpha1, + seq_softmax_output, + max_seq, + group_padding_output, + dim1, + batchgemm1_output, + dim1); + CHECK_EQ(r, 0); + r = xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + top_data, + batchgemm1_output, + offset_xpu, + max_seq, + batch, + dim1, + 1); // is_depad = 1 + CHECK_EQ(r, 0); } } // namespace xpu diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index e440bde4146a88929c52c20ff1038eb35be91d38..f2ad667886ac33191687b70aa7548050461545e7 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/batch_norm_compute.h b/lite/kernels/xpu/batch_norm_compute.h index 7b428476b96ca3b2b60c66df28b7f82e8f57bebc..f5244574cebab6b10bbd81af9c8303ffec9f0965 100644 --- a/lite/kernels/xpu/batch_norm_compute.h +++ b/lite/kernels/xpu/batch_norm_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/cast_compute.h b/lite/kernels/xpu/cast_compute.h index 8992c29732630a5bf0d9c092461569234257e3a9..efd4cbae8d2d708b25729f04f36bc22d1d909e11 100644 --- a/lite/kernels/xpu/cast_compute.h +++ b/lite/kernels/xpu/cast_compute.h @@ -13,6 +13,7 @@ // limitations under the License. 
 #pragma once
+
 #include "lite/core/kernel.h"
 
 namespace paddle {
diff --git a/lite/kernels/xpu/conv_compute.h b/lite/kernels/xpu/conv_compute.h
index b7631ce4e5773afe7cdd797a245c806b51d25c56..76159444c1861fad14b6ac4f0d32da626b3a8802 100644
--- a/lite/kernels/xpu/conv_compute.h
+++ b/lite/kernels/xpu/conv_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+
 #include "lite/core/kernel.h"
 
 namespace paddle {
diff --git a/lite/kernels/xpu/dropout_compute.h b/lite/kernels/xpu/dropout_compute.h
index 0eaafb4f5555a163623402fee82d50bfa095b0b3..360450df537a68b9412d21db4e06dc74d6071ca6 100644
--- a/lite/kernels/xpu/dropout_compute.h
+++ b/lite/kernels/xpu/dropout_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+
 #include "lite/core/kernel.h"
 
 namespace paddle {
diff --git a/lite/kernels/xpu/elementwise_compute.h b/lite/kernels/xpu/elementwise_compute.h
index 863ee3c643f9c431dacd057e251941914b1dd1c5..d910b9293e74428c426d9505245bc5958fc9df3a 100644
--- a/lite/kernels/xpu/elementwise_compute.h
+++ b/lite/kernels/xpu/elementwise_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+
 #include "lite/core/kernel.h"
 
 namespace paddle {
diff --git a/lite/kernels/xpu/layer_norm_compute.h b/lite/kernels/xpu/layer_norm_compute.h
index 5d2df37795811ef8027e12b25139f2b7091cceed..9eeb5924c512fcfbf8825a9ff775378dfe4d6d4c 100644
--- a/lite/kernels/xpu/layer_norm_compute.h
+++ b/lite/kernels/xpu/layer_norm_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+
 #include "lite/core/kernel.h"
 
 namespace paddle {
diff --git a/lite/kernels/xpu/lookup_table_compute.cc b/lite/kernels/xpu/lookup_table_compute.cc
index 568d303adefaa06bb8665b4cc92d4a949419d587..4256687fa8c17c7fe36e91ff727d52eb1047646f 100644
--- a/lite/kernels/xpu/lookup_table_compute.cc
+++ b/lite/kernels/xpu/lookup_table_compute.cc
@@ -29,12 +29,13 @@ void LookupTableCompute::Run() {
   int embed_dim = param.W->dims()[1];
 
   int r = xdnn::embedding(
-      ctx.GetRawContext(),                    /* context */
-      num,                                    /* num */
-      param.Ids->data(),                      /* indices */
-      embed_dim,                              /* embed_dim */
-      param.W->data(),                        /* table */
-      param.Out->mutable_data(TARGET(kXPU))   /* top */);
+      ctx.GetRawContext(),                    /* context */
+      num,                                    /* num */
+      param.Ids->data(),                      /* indices */
+      embed_dim,                              /* embed_dim */
+      param.W->data(),                        /* table */
+      param.Out->mutable_data(TARGET(kXPU)),  /* top */
+      param.padding_idx                       /* padding_idx */);
   CHECK_EQ(r, 0);
 }
 
diff --git a/lite/kernels/xpu/lookup_table_compute.h b/lite/kernels/xpu/lookup_table_compute.h
index 2ba1afc869cf9c3a49ab1ad29c66c6c89ba87d19..7a43f5244e5d514a1644aac0437951af35bb7767 100644
--- a/lite/kernels/xpu/lookup_table_compute.h
+++ b/lite/kernels/xpu/lookup_table_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.cc b/lite/kernels/xpu/match_matrix_tensor_compute.cc index 3c4e896d23add6df99a7b66a830dc526dc808e95..c3ee547ccce56cd16401e4aca465e64d99a26185 100644 --- a/lite/kernels/xpu/match_matrix_tensor_compute.cc +++ b/lite/kernels/xpu/match_matrix_tensor_compute.cc @@ -23,12 +23,15 @@ namespace kernels { namespace xpu { void MatchMatrixTensorCompute::PrepareForRun() { - wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - - offset_l_cpu.reset(new int[64]); - offset_r_cpu.reset(new int[64]); + wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + + offset_l_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_r_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } void MatchMatrixTensorCompute::Run() { @@ -76,25 +79,25 @@ void MatchMatrixTensorCompute::Run() { int* offset_r_xpu = reinterpret_cast(offset_r_xpu_guard_->addr_); int r = xdnn::gemm_int16_tmp_api( - ctx.GetRawContext(), /* ctx */ - false, - false, /* trans_a, trans_b */ - x->dims()[0], - dim_t * dim_in, - dim_in, /* m, n, k */ - 1.0f, - bottom_l_data, - dim_in, /* alpha, data_a, lda */ - w_data, - dim_t * dim_in, - 0.0f, /* data_b, ldb, beta */ - bottom_l_trans_data, - dim_t * dim_in, /* data_c, ldc */ - nullptr, /* bias */ - xdnn::Activation_t::LINEAR, - 0.0f, - w_max, - wx_max /* max_a, max_b, max_c */); + ctx.GetRawContext(), /* ctx */ + false, /* trans_a */ + false, /* trans_b */ + x->dims()[0], /* m */ + dim_t * dim_in, /* n */ + dim_in, /* k */ + 1.0f, /* alpha */ + bottom_l_data, /* data_a */ + dim_in, /* lda */ + w_data, /* data_b */ + dim_t * dim_in, /* ldb */ + 0.0f, /* beta */ + bottom_l_trans_data, /* data_c */ + dim_t * dim_in, /* ldc */ + nullptr, /* bias */ + xdnn::Activation_t::LINEAR, /* act */ + 0.0f, /* max_a */ + w_max, /* max_b */ + wx_max /* max_c */); CHECK_EQ(r, 0); int max_width = 0; @@ -110,14 +113,14 @@ void MatchMatrixTensorCompute::Run() { max_width = offset_r_cpu[i] - offset_r_cpu[i - 1]; } } - xpu_memcpy(offset_l_xpu, - offset_l_cpu.get(), - offset_l.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(offset_r_xpu, - offset_r_cpu.get(), - offset_r.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(offset_l_xpu, + offset_l_cpu.get(), + offset_l.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_r_xpu, + offset_r_cpu.get(), + offset_r.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); r = xdnn::match_matrix_tensor(ctx.GetRawContext(), batch_size, diff --git a/lite/kernels/xpu/matmul_compute.h b/lite/kernels/xpu/matmul_compute.h index aca3cbc603eff490ae19fd2546352adca3c1a7cf..0fef2086e294fa5cd79e49adeb6b136f484a1efd 100644 --- a/lite/kernels/xpu/matmul_compute.h +++ b/lite/kernels/xpu/matmul_compute.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/mul_compute.h b/lite/kernels/xpu/mul_compute.h index bb2778c0e73189b11135395b42655e0250bbfd0a..3c91384b726a4d43c6a38e96d143657c12dadd8a 100644 --- a/lite/kernels/xpu/mul_compute.h +++ b/lite/kernels/xpu/mul_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 5648554c41c76396184b7dc536f8c8628cbf23e4..39e14f04a8c41bc057ac5733d881ba713c0883b2 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/scale_compute.h b/lite/kernels/xpu/scale_compute.h index 6989b0f0f31e54a63dac2f7c2090dc676e31acfb..5a84fe26a0d409dcd979ca7c26128775a4f64df2 100644 --- a/lite/kernels/xpu/scale_compute.h +++ b/lite/kernels/xpu/scale_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/search_fc_compute.cc b/lite/kernels/xpu/search_fc_compute.cc index 79f4c2d0d809ea9848fb383863d0f9dd2ec5a2ae..52a9999b468564d81288ce494f575a8d1d46e4fc 100644 --- a/lite/kernels/xpu/search_fc_compute.cc +++ b/lite/kernels/xpu/search_fc_compute.cc @@ -23,7 +23,8 @@ namespace kernels { namespace xpu { void SearchFcCompute::PrepareForRun() { - maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(float)); + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(float), false /* use_l3 */); } void SearchFcCompute::Run() { @@ -59,34 +60,34 @@ void SearchFcCompute::Run() { float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f}; - xpu_memcpy(maxs_xpu, - &maxs_cpu[0], - 8 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(maxs_xpu, + &maxs_cpu[0], + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::findmax( ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu); CHECK_EQ(r, 0); r = xdnn::gemm_int16_maxptr( ctx.GetRawContext(), /* ctx */ - false, - true, /*trans_a, trans_b*/ - batch, - _out, - _in, /*m, n, k*/ - 1.0f, - bottom_data, - _in, /*alpha, data_a, lda*/ - weights, - _in, - 0.0f, /*data_b, ldb, beta*/ - top_data, - _out, - bias_data, /* data_c, ldc, bias*/ - act, - maxs_xpu, - maxs_xpu + 4, - nullptr /*act, max_a, max_b, max_c*/); + false, /* trans_a */ + true, /* trans_b */ + batch, /* m */ + _out, /* n */ + _in, /* k */ + 1.0f, /* alpha */ + bottom_data, /* data_a */ + _in, /* lda */ + weights, /* data_b */ + _in, /* ldb */ + 0.0f, /* beta */ + top_data, /* data_c */ + _out, /* ldc */ + bias_data, /* bias */ + act, /* act */ + maxs_xpu, /* max_a */ + maxs_xpu + 4, /* max_b */ + nullptr /* max_c */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/search_grnn_compute.cc b/lite/kernels/xpu/search_grnn_compute.cc index 1c19f58da1b5deaa3d74791561494f13b681cf3a..d4e2e4a9969149b0d2f7f2b75c195d1b3a5fda5c 100644 --- a/lite/kernels/xpu/search_grnn_compute.cc +++ b/lite/kernels/xpu/search_grnn_compute.cc @@ -24,13 +24,16 @@ namespace kernels { namespace xpu { void SearchGrnnCompute::PrepareForRun() { - offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); - 
maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float)); + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SEQ_LEN * sizeof(int), false /* use_l3 */); + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float), + false /* use_l3 */); - idx_sorted_by_width_data_cpu.reset(new int[64]); - offset_cpu.reset(new int[64]); - new_offset_cpu.reset(new int[256]); + idx_sorted_by_width_data_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + new_offset_cpu.reset(new int[XPU_MAX_LOD_SEQ_LEN]); } void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param, @@ -96,10 +99,10 @@ void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param, layout_input->Resize({dim0, dim1}); } - xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), - idx_sorted_by_width_data_cpu.get(), - idx_sorted_by_width->numel() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), + idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width->numel() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } void SearchGrnnCompute::Run() { @@ -156,14 +159,14 @@ void SearchGrnnCompute::Run() { for (size_t i = 0; i < new_offset.size(); ++i) { new_offset_cpu[i] = new_offset[i]; } - xpu_memcpy(offset_xpu, - offset_cpu.get(), - offset.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(new_offset_xpu, - new_offset_cpu.get(), - new_offset.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(offset_xpu, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(new_offset_xpu, + new_offset_cpu.get(), + new_offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::search_seq2batch(ctx.GetRawContext(), batch, @@ -200,10 +203,10 @@ void SearchGrnnCompute::Run() { 0.0f, 0.0f, 0.0f}; - xpu_memcpy(maxs_xpu, - maxs_cpu, - 16 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(maxs_xpu, + maxs_cpu, + 16 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); r = xdnn::findmax( ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu); CHECK_EQ(r, 0); diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.cc b/lite/kernels/xpu/sequence_arithmetic_compute.cc index 226c615dba57ae381ed2457e588c5df32f25e04b..e1b9866123395b2d7867154c3b398adae670ed97 100644 --- a/lite/kernels/xpu/sequence_arithmetic_compute.cc +++ b/lite/kernels/xpu/sequence_arithmetic_compute.cc @@ -37,44 +37,54 @@ void SequenceArithmeticCompute::Run() { const auto* bottom_data1 = bottom1->data(); auto* top_data = top->mutable_data(TARGET(kXPU)); + int r = 0; switch (op_type) { case 1: // addition: top[0] = bottom[0] + bottom[1] if (len1 > len2) { - xdnn::elementwise_add( + r = xdnn::elementwise_add( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); - xdnn::memcpy_device(ctx.GetRawContext(), - &top_data[len2], - &bottom_data0[len2], - (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); } else { - xdnn::elementwise_add( + r = xdnn::elementwise_add( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); } break; case 2: // substraction: top[0] = 
bottom[0] - bottom[1] if (len1 > len2) { - xdnn::elementwise_sub( + r = xdnn::elementwise_sub( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); - xdnn::memcpy_device(ctx.GetRawContext(), - &top_data[len2], - &bottom_data0[len2], - (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); } else { - xdnn::elementwise_sub( + r = xdnn::elementwise_sub( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); } break; case 3: // multiplication: top[0] = bottom[0] * bottom[1] if (len1 > len2) { - xdnn::elementwise_mul( + r = xdnn::elementwise_mul( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); - xdnn::memcpy_device(ctx.GetRawContext(), - &top_data[len2], - &bottom_data0[len2], - (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); + r = xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + CHECK_EQ(r, 0); } else { - xdnn::elementwise_mul( + r = xdnn::elementwise_mul( ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + CHECK_EQ(r, 0); } break; default: diff --git a/lite/kernels/xpu/sequence_concat_compute.cc b/lite/kernels/xpu/sequence_concat_compute.cc index fd7f5999a6ccb18efbcb0e96b50f2b31884fc21c..349fdbad2a89300703c820588b4647bfba77ece5 100644 --- a/lite/kernels/xpu/sequence_concat_compute.cc +++ b/lite/kernels/xpu/sequence_concat_compute.cc @@ -23,11 +23,13 @@ namespace kernels { namespace xpu { void SequenceConcatCompute::PrepareForRun() { - lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); - lod0_cpu.reset(new int[64]); - lod1_cpu.reset(new int[64]); + lod0_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + lod1_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } template @@ -106,14 +108,14 @@ void SequenceConcatCompute::Run() { for (int i = 0; i < lod1.size(); ++i) { lod1_cpu[i] = lod1[i]; } - xpu_memcpy(lod0_xpu, - lod0_cpu.get(), - lod0.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(lod1_xpu, - lod1_cpu.get(), - lod1.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(lod0_xpu, + lod0_cpu.get(), + lod0.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(lod1_xpu, + lod1_cpu.get(), + lod1.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::sequence_concat(ctx.GetRawContext(), xs[0]->data(), diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc index 81d9b5873c3c42afe94acdd8eb5a292326b7a7b6..f8e71639b7f4c67f7e60103a42766a4d32026bc1 100644 --- a/lite/kernels/xpu/sequence_pool_compute.cc +++ b/lite/kernels/xpu/sequence_pool_compute.cc @@ -23,8 +23,9 @@ namespace kernels { namespace xpu { void XPUSequencePoolCompute::PrepareForRun() { - lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - lod_cpu.reset(new int[64]); + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } void XPUSequencePoolCompute::Run() { @@ -55,10 +56,10 @@ void XPUSequencePoolCompute::Run() { 
lod_cpu[i] = in_lod[i]; } int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); - xpu_memcpy(lod_xpu, - lod_cpu.get(), - in_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::sequence_pooling_forward(ctx.GetRawContext(), diff --git a/lite/kernels/xpu/sequence_reverse_compute.cc b/lite/kernels/xpu/sequence_reverse_compute.cc index 11e4b80570c19fa90e7846d18a88f966f9a003b7..bb3f37890b644a660c594fb0fd6eea332b90b8d6 100644 --- a/lite/kernels/xpu/sequence_reverse_compute.cc +++ b/lite/kernels/xpu/sequence_reverse_compute.cc @@ -23,8 +23,9 @@ namespace xpu { template void SequenceReverseCompute::PrepareForRun() { - lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - lod_cpu.reset(new int[64]); + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } template @@ -58,10 +59,10 @@ void SequenceReverseCompute::Run() { lod_cpu[i] = lod[i]; } int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); - xpu_memcpy(lod_xpu, - lod_cpu.get(), - lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(lod_xpu, + lod_cpu.get(), + lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::sequence_reverse(ctx.GetRawContext(), batch_size, diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc index 54c74211f9738995a8191c77e879a85762d71b3b..4e8485e2999b29dfb487d0c7c632fcfa7a9a3d00 100644 --- a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc @@ -23,10 +23,11 @@ namespace kernels { namespace xpu { void SequenceTopkAvgPoolingCompute::PrepareForRun() { - lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); - in_lod_cpu.reset(new int[64]); - row_lod_cpu.reset(new int[64]); - col_lod_cpu.reset(new int[64]); + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + in_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + row_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + col_lod_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } void SequenceTopkAvgPoolingCompute::Run() { @@ -81,22 +82,22 @@ void SequenceTopkAvgPoolingCompute::Run() { for (int i = 0; i < col_lod.size(); ++i) { col_lod_cpu[i] = col_lod[i]; } - xpu_memcpy(in_lod_xpu, - in_lod_cpu.get(), - in_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(row_lod_xpu, - row_lod_cpu.get(), - row_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(col_lod_xpu, - col_lod_cpu.get(), - col_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(topks_xpu, - topks.data(), - topks.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(in_lod_xpu, + in_lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(row_lod_xpu, + row_lod_cpu.get(), + row_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(col_lod_xpu, + col_lod_cpu.get(), + col_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topks_xpu, + topks.data(), + topks.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(), in_data, diff --git 
a/lite/kernels/xpu/softmax_compute.h b/lite/kernels/xpu/softmax_compute.h index e807f38a2ea3c9645b78340ac4dc87d1984c40f7..a3d282588776b7d64bc856adf92685c8524af035 100644 --- a/lite/kernels/xpu/softmax_compute.h +++ b/lite/kernels/xpu/softmax_compute.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "lite/core/kernel.h" namespace paddle { diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc index 90a6c70b49f39ce744f2a03eec41d79ddc768a19..156162923ceeb4abed466164b11672715f813fd7 100644 --- a/lite/kernels/xpu/stack_compute.cc +++ b/lite/kernels/xpu/stack_compute.cc @@ -25,9 +25,8 @@ void StackCompute::PrepareForRun() { auto& param = this->Param(); int n = param.X.size(); - void* x_ptr = nullptr; - xpu_malloc(&x_ptr, n * 8 /* sizeof(__global__ float*) */); - x_ptr_guard_.reset(x_ptr); + x_ptr_guard_ = TargetWrapperXPU::MallocScratchPad( + n * 8 /* sizeof(__global__ float*) */, false /* use_l3 */); x_ptr_cpu_.reserve(n); } @@ -47,14 +46,15 @@ void StackCompute::Run() { for (int i = 0; i < n; ++i) { x_ptr_cpu_[i] = param.X[i]->data(); } - xpu_memcpy(x_ptr_guard_.get(), &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy( + x_ptr_guard_->addr_, &x_ptr_cpu_[0], n * 8, XPU_HOST_TO_DEVICE)); int r = xdnn::stack_forward( ctx.GetRawContext(), /* context */ height, /* height */ width, /* width */ n, /* n */ - x_ptr_guard_.get(), /* x_ptr */ + x_ptr_guard_->addr_, /* x_ptr */ param.Out->mutable_data(TARGET(kXPU)) /* out */); CHECK_EQ(r, 0); } diff --git a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h index 1ba1d92dc9479cfd00c5e154df7b5476ffd9976c..7618e2a147b862aee097a42b36721d520ad6012c 100644 --- a/lite/kernels/xpu/stack_compute.h +++ b/lite/kernels/xpu/stack_compute.h @@ -14,10 +14,9 @@ #pragma once -#include #include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard #include "lite/core/kernel.h" -#include "lite/kernels/xpu/utils.h" // XPUFreeDeleter namespace paddle { namespace lite { @@ -35,7 +34,7 @@ class StackCompute : public KernelLite { virtual ~StackCompute() = default; private: - std::unique_ptr x_ptr_guard_; + XPUScratchPadGuard x_ptr_guard_; std::vector x_ptr_cpu_; }; diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 981922f8eacab57da4638e1fdcdd3df72465b379..ac301108386e2da43b2efc372b96531df8d55523 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -53,10 +53,11 @@ bool SubgraphEngine::BuildDeviceProgram() { // IR graph subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - if (origin_program_.empty()) { + if (!origin_program_) { BuildOriginProgram(); } - for (auto& inst : origin_program_) { + const auto& insts = origin_program_->instructions(kRootBlockIdx); + for (auto& inst : insts) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); @@ -123,7 +124,7 @@ bool SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_inames_[i]); auto precision = node->precision(); auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); + origin_itensors_[i] = exec_scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] @@ -147,7 +148,7 @@ bool SubgraphEngine::BuildDeviceProgram() { auto node = graph.Get(device_onames_[i]); auto precision = node->precision(); auto layout = 
node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + origin_otensors_[i] = exec_scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] @@ -220,11 +221,11 @@ bool SubgraphEngine::LaunchDeviceProgram() { void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), - param.sub_block_idx, - param.sub_block_desc, + param.block_idx, + param.program_desc, + param.exec_scope, param.input_data_names, - param.output_data_names, - param.scope)); + param.output_data_names)); CHECK(engine_); } diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index f09a06a85d5382c72e9efb20cede8bea1922f2da..25ffa721572ce05b0652d56659f3db12903c589b 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine { public: SubgraphEngine(KernelContext *ctx, int block_idx, - cpp::BlockDesc *block_desc, + const std::shared_ptr &program_desc, + Scope *exec_scope, const std::vector &input_names, - const std::vector &output_names, - Scope *scope) - : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) {} + const std::vector &output_names) + : subgraph::Engine(ctx, + block_idx, + program_desc, + exec_scope, + input_names, + output_names) {} protected: bool PrepareWorkspaceForDeviceProgram() override; diff --git a/lite/kernels/xpu/var_conv_2d_compute.cc b/lite/kernels/xpu/var_conv_2d_compute.cc index b573c810922db98e901c9f9a1953116f3fdfc657..b73581951f46a5f3cdbaf64cf732b1909805d27d 100644 --- a/lite/kernels/xpu/var_conv_2d_compute.cc +++ b/lite/kernels/xpu/var_conv_2d_compute.cc @@ -23,10 +23,12 @@ namespace kernels { namespace xpu { void VarConv2DCompute::PrepareForRun() { - offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); - offset_x_cpu.reset(new int[64]); - offset_y_cpu.reset(new int[64]); + offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad( + XPU_MAX_LOD_SIZE * sizeof(int), false /* use_l3 */); + offset_x_cpu.reset(new int[XPU_MAX_LOD_SIZE]); + offset_y_cpu.reset(new int[XPU_MAX_LOD_SIZE]); } void VarConv2DCompute::Run() { @@ -94,14 +96,14 @@ void VarConv2DCompute::Run() { offset_x_cpu[i] = offset_x[i]; offset_y_cpu[i] = offset_y[i]; } - xpu_memcpy(offset_x_xpu, - offset_x_cpu.get(), - (batch + 1) * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - xpu_memcpy(offset_y_xpu, - offset_y_cpu.get(), - (batch + 1) * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); + XPU_CALL(xpu_memcpy(offset_x_xpu, + offset_x_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(offset_y_xpu, + offset_y_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); int r = xdnn::search_varconv(ctx.GetRawContext(), batch, diff --git a/lite/model_parser/base/apis.h b/lite/model_parser/base/apis.h index 2ad6ff47ee17fcdfab335b3a6f87229811d971ae..fa3449017c902479a7f6ad37ef73b3a316f585cc 100644 --- a/lite/model_parser/base/apis.h +++ b/lite/model_parser/base/apis.h @@ -17,6 +17,7 @@ #include "lite/model_parser/base/block_desc.h" #include "lite/model_parser/base/op_desc.h" #include 
"lite/model_parser/base/program_desc.h" +#include "lite/model_parser/base/proto_desc.h" #include "lite/model_parser/base/traits.h" #include "lite/model_parser/base/var_desc.h" #include "lite/utils/all.h" diff --git a/lite/model_parser/base/proto_desc.h b/lite/model_parser/base/proto_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..0f62ef6e43883fd41c509795d1e4f695fdbb8910 --- /dev/null +++ b/lite/model_parser/base/proto_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { + +// The Index of first Block in Program. also called root block. +constexpr int kRootBlockIdx = 0; +// The Parent Index of root Block, this block does not exist. +constexpr int kNoneBlockIdx = -1; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/vector_view.h b/lite/model_parser/base/vector_view.h index adec1933a2f40face415f610c9ccf2e9f275020c..e4149d9c5acae83472904a86c47659355972855e 100644 --- a/lite/model_parser/base/vector_view.h +++ b/lite/model_parser/base/vector_view.h @@ -57,21 +57,35 @@ class VectorView { public: typedef vector_view::VectorTraits Traits; explicit VectorView(typename Traits::vector_type const* cvec) { - CHECK(cvec); cvec_ = cvec; } typename Traits::subscript_return_type operator[](size_t i) const { return cvec_->operator[](i); } - typename Traits::const_iterator begin() const { return cvec_->begin(); } - typename Traits::const_iterator end() const { return cvec_->end(); } - size_t size() const { return cvec_->size(); } + typename Traits::const_iterator begin() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->begin(); + } + typename Traits::const_iterator end() const { + if (!cvec_) { + return typename Traits::const_iterator(); + } + return cvec_->end(); + } + size_t size() const { + if (!cvec_) { + return 0; + } + return cvec_->size(); + } operator std::vector() const { VLOG(5) << "Copying elements out of VectorView will damage performance."; std::vector tmp; - tmp.reserve(cvec_->size()); - for (auto val : *cvec_) { - tmp.push_back(val); + tmp.reserve(size()); + for (size_t i = 0; i < size(); ++i) { + tmp.push_back(cvec_->operator[](i)); } return tmp; } diff --git a/lite/model_parser/compatible_pb.cc b/lite/model_parser/compatible_pb.cc index b8db89230d56e22a361cc4972382d74b8d6f08fd..8bfeb419e51b01ae008959ac5af3e9752834b1ab 100644 --- a/lite/model_parser/compatible_pb.cc +++ b/lite/model_parser/compatible_pb.cc @@ -234,7 +234,7 @@ void OpAttrsCppToAny(const cpp::OpDesc &cpp_desc, OpDescType *any_desc) { template <> \ void TransformBlockDescCppToAny(const cpp::T &cpp_desc, \ NT::T *any_desc) { \ - auto desc = cpp_desc; \ + const cpp::T &desc = cpp_desc; \ any_desc->SetIdx(desc.Idx()); \ any_desc->SetParentIdx(desc.ParentIdx()); \ any_desc->SetForwardBlockIdx(desc.ForwardBlockIdx()); \ diff --git a/lite/model_parser/flatbuffers/io.cc 
b/lite/model_parser/flatbuffers/io.cc index 28fa32398cfe76075c1a429f9f1d348842465dfc..ef8e9afaefe94d72113299050f16077a09f6c6cf 100644 --- a/lite/model_parser/flatbuffers/io.cc +++ b/lite/model_parser/flatbuffers/io.cc @@ -15,20 +15,21 @@ #include "lite/model_parser/flatbuffers/io.h" #include #include +#include namespace paddle { namespace lite { namespace fbs { void LoadModel(const std::string& path, ProgramDesc* prog) { + CHECK(prog); FILE* file = fopen(path.c_str(), "rb"); fseek(file, 0, SEEK_END); - int64_t size = ftell(file); + int64_t length = ftell(file); rewind(file); - char* data = new char[size]; - size = fread(data, 1, size, file); + std::vector buf(length); + CHECK(fread(buf.data(), 1, length, file)); fclose(file); - std::unique_ptr buf(data); prog->Init(std::move(buf)); } diff --git a/lite/model_parser/flatbuffers/op_desc.h b/lite/model_parser/flatbuffers/op_desc.h index e133ffbc27dce1a8c00eed82cc6d4fca76a8564d..450aa49fa13b676b33bef8490c65061dc504431d 100644 --- a/lite/model_parser/flatbuffers/op_desc.h +++ b/lite/model_parser/flatbuffers/op_desc.h @@ -62,7 +62,7 @@ class OpDesc : public OpDescAPI { std::vector Output(const std::string& param) const override { const auto& var = desc_->outputs()->LookupByKey(param.c_str()); std::vector args_vec; - if (var->arguments()) { + if (var && var->arguments()) { args_vec.reserve(var->arguments()->size()); for (const auto& out : *var->arguments()) { args_vec.push_back(out->str()); @@ -169,8 +169,7 @@ class OpDesc : public OpDescAPI { } bool HasOutput(const std::string& param) const { - NotImplemented(); - return false; + return !Output(param).empty(); } const std::map& attrs() const { diff --git a/lite/model_parser/flatbuffers/program_desc.h b/lite/model_parser/flatbuffers/program_desc.h index c651d9dc0671aced942bb28466e829a40226c2ba..55218eef5b4037d13b2f45db6de6b94cb39d994e 100644 --- a/lite/model_parser/flatbuffers/program_desc.h +++ b/lite/model_parser/flatbuffers/program_desc.h @@ -29,16 +29,25 @@ namespace fbs { class ProgramDesc : public ProgramDescAPI { public: ProgramDesc() = default; - explicit ProgramDesc(std::unique_ptr buf) { - Init(std::move(buf)); + explicit ProgramDesc(const std::vector& buf) { Init(buf); } + explicit ProgramDesc(std::vector&& buf) { + Init(std::forward>(buf)); } - size_t BlocksSize() const override { return desc_->blocks()->size(); } + void Init(const std::vector& buf) { + CHECK(buf.data()); + buf_ = buf; + InitProgramDesc(); + } - void Init(std::unique_ptr buf) { - CHECK(buf.get() != nullptr); + void Init(std::vector&& buf) { + CHECK(buf.data()); buf_ = std::move(buf); - desc_ = proto::GetProgramDesc(buf_.get()); + InitProgramDesc(); + } + + void InitProgramDesc() { + desc_ = proto::GetProgramDesc(buf_.data()); blocks_.reserve(BlocksSize()); for (size_t idx = 0; idx < BlocksSize(); ++idx) { blocks_.push_back(BlockDesc(desc_->blocks()->Get(idx))); @@ -46,12 +55,12 @@ class ProgramDesc : public ProgramDescAPI { } void CopyFrom(const ProgramDesc& other) { - size_t length = strlen(static_cast(other.raw_buf())); - std::unique_ptr buf(new char[length]); - memcpy(buf.get(), other.raw_buf(), length); - Init(std::move(buf)); + buf_ = other.buf(); + Init(buf_); } + size_t BlocksSize() const override { return desc_->blocks()->size(); } + template T const* GetBlock(int32_t idx) const; @@ -72,11 +81,11 @@ class ProgramDesc : public ProgramDescAPI { proto::ProgramDesc const* raw_desc() const { return desc_; } - const void* raw_buf() const { return buf_.get(); } + const std::vector& buf() const { return buf_; } 
private: proto::ProgramDesc const* desc_; - std::unique_ptr buf_; + std::vector buf_; std::vector blocks_; private: diff --git a/lite/model_parser/flatbuffers/vector_view.h b/lite/model_parser/flatbuffers/vector_view.h index 1cc890e98d2a85b3113fcf49a68701595e63964e..bb1331823a2dce79d2b3a6784f1f2d5b5864281d 100644 --- a/lite/model_parser/flatbuffers/vector_view.h +++ b/lite/model_parser/flatbuffers/vector_view.h @@ -51,6 +51,7 @@ struct FBSStrIterator { flatbuffers::Offset>::return_type> VI; + FBSStrIterator() = default; explicit FBSStrIterator(const VI& iter) { iter_ = iter; } const VI& raw_iter() const { return iter_; } @@ -104,20 +105,21 @@ class VectorView { explicit VectorView(typename Traits::vector_type const* cvec) { cvec_ = cvec; } - std::string operator[](size_t i) const { - CHECK(cvec_); - return cvec_->operator[](i)->str(); - } + std::string operator[](size_t i) const { return cvec_->operator[](i)->str(); } vector_view::FBSStrIterator begin() const { - CHECK(cvec_); + if (!cvec_) { + return vector_view::FBSStrIterator(); + } return vector_view::FBSStrIterator(cvec_->begin()); } vector_view::FBSStrIterator end() const { - CHECK(cvec_); + if (!cvec_) { + return vector_view::FBSStrIterator(); + } return vector_view::FBSStrIterator(cvec_->end()); } size_t size() const { - if (cvec_ == nullptr) { + if (!cvec_) { return 0; } return cvec_->size(); @@ -126,10 +128,8 @@ class VectorView { VLOG(5) << "Copying elements out of VectorView will damage performance."; std::vector tmp; tmp.reserve(size()); - if (cvec_ != nullptr) { - for (auto val : *cvec_) { - tmp.push_back(val->str()); - } + for (size_t i = 0; i < size(); ++i) { + tmp.push_back(cvec_->operator[](i)->str()); } return tmp; } diff --git a/lite/model_parser/general/CMakeLists.txt b/lite/model_parser/general/CMakeLists.txt index fe3b2f848e404385b8d948db676865b8039f4ba2..ed53678dfac4cc58b208c2faa8573bcd06943aaa 100644 --- a/lite/model_parser/general/CMakeLists.txt +++ b/lite/model_parser/general/CMakeLists.txt @@ -3,4 +3,4 @@ lite_cc_library(cpp_var_desc SRCS var_desc.cc) lite_cc_library(cpp_block_desc SRCS block_desc.cc) lite_cc_library(cpp_program_desc SRCS program_desc.cc) -set(cpp_wrapper cpp_op_desc cpp_var_desc cpp_block_desc cpp_program_desc PARENT_SCOPE) +set(cpp_wrapper cpp_program_desc cpp_block_desc cpp_var_desc cpp_op_desc PARENT_SCOPE) diff --git a/lite/model_parser/pb/var_desc.cc b/lite/model_parser/pb/var_desc.cc index f849b8dd0ed103f789aec41e5c88f3e4f3cdf878..42625ee6190fb98c50de2b88a08b9910d91ed014 100644 --- a/lite/model_parser/pb/var_desc.cc +++ b/lite/model_parser/pb/var_desc.cc @@ -294,9 +294,9 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_->type().tensor_array().tensor(); default: - LOG(FATAL) - << "Getting 'tensor_desc' is not supported by the type of var %s." - << this->Name(); + LOG(WARNING) << "Getting 'tensor_desc' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return framework::proto::VarDesc().type().lod_tensor().tensor(); } @@ -312,10 +312,9 @@ std::vector VarDesc::tensor_descs() const { } return res; default: - LOG(FATAL) - << "Getting 'tensor_descs' is not supported by the type of var " - "%s." 
- << this->Name(); + LOG(WARNING) << "Getting 'tensor_descs' is not supported by the type(" + << static_cast(desc_->type().type()) << ") of var " + << this->Name(); } return std::vector(); } diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index a5aaf65de4e212fcf862eff62df676a021191d39..1d947a8a0d365401c021a78cb4aba547c70f86b7 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -90,6 +90,7 @@ add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_m add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op basic SRCS assign_value_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) +add_operator(fake_quantize_dequantize_abs_max_op extra SRCS fake_quantize_dequantize_abs_max.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${op_DEPS}) @@ -116,6 +117,7 @@ add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS}) add_operator(clip_op extra SRCS clip_op.cc DEPS ${op_DEPS}) +add_operator(print_op extra SRCS print_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/__xpu__mmdnn_op.cc b/lite/operators/__xpu__mmdnn_op.cc index 35024da911ba0659c5005a1adc641fa3adc2f282..b898c0b132dc0767c8ba28c29098ac998c2cab21 100644 --- a/lite/operators/__xpu__mmdnn_op.cc +++ b/lite/operators/__xpu__mmdnn_op.cc @@ -88,6 +88,78 @@ bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc, return true; } +bool XPUMmdnnBidEmbGrnnAttOp2::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp2::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.emb0_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb0_out->set_lod({id_lod}); + param_.grnn_fw_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp2::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = 
scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.emb0_out = scope->FindVar(op_desc.Output("emb0_out").front()) + ->GetMutable(); + param_.grnn_fw_pool_out = + scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; } bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const { @@ -157,6 +229,7 @@ bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc, param_.input_w_max = op_desc.GetAttr("input_w_max"); param_.conv_w_max = op_desc.GetAttr("conv_w_max"); param_.topks = op_desc.GetAttr>("topks"); + param_.output_channel = op_desc.GetAttr("output_channel"); param_.channel_num = op_desc.GetAttr("channel_num"); param_.dim_t = op_desc.GetAttr("dim_t"); return true; @@ -182,10 +255,10 @@ bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, auto t = scope->FindVar(name)->GetMutable(); param_.concat_7in1_x.push_back(t); } - param_.concat_2in1_x.clear(); - for (auto& name : op_desc.Input("concat_2in1_x")) { + param_.concat_topk_x.clear(); + for (auto& name : op_desc.Input("concat_topk_x")) { auto t = scope->FindVar(name)->GetMutable(); - param_.concat_2in1_x.push_back(t); + param_.concat_topk_x.push_back(t); } param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) ->GetMutable(); @@ -231,6 +304,8 @@ bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att, paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att2, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp2); REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att, paddle::lite::operators::XPUMmdnnBidEmbAttOp); REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk, diff --git a/lite/operators/__xpu__mmdnn_op.h b/lite/operators/__xpu__mmdnn_op.h index 7038898cad0823746f905e4e60c06885b57a737c..ba815a1eec7d0913bc08b4f8fa520de73a4bb835 100644 --- a/lite/operators/__xpu__mmdnn_op.h +++ b/lite/operators/__xpu__mmdnn_op.h @@ -41,6 +41,29 @@ class XPUMmdnnBidEmbGrnnAttOp : public OpLite { mutable XPUMmdnnBidEmbGrnnAttParam param_; }; +class XPUMmdnnBidEmbGrnnAttOp2 : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp2() {} + + explicit XPUMmdnnBidEmbGrnnAttOp2(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); 
} + + std::string DebugString() const override { + return "XPUMmdnnBidEmbGrnnAttOp2"; + } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam2 param_; +}; + class XPUMmdnnBidEmbAttOp : public OpLite { public: XPUMmdnnBidEmbAttOp() {} diff --git a/lite/operators/assign_op.cc b/lite/operators/assign_op.cc index fe1e8db1f954af38041621d1d676cf16833357da..f2237230dceda55c89a423e0ee9504ee1e3c1de8 100644 --- a/lite/operators/assign_op.cc +++ b/lite/operators/assign_op.cc @@ -21,15 +21,15 @@ namespace lite { namespace operators { bool AssignOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.X); - CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.X || param_.X_array); + CHECK_OR_FALSE(param_.Out || param_.Out_array); return true; } bool AssignOpLite::InferShapeImpl() const { - if (param_.X != nullptr) { + if (param_.X) { param_.Out->Resize(param_.X->dims()); - } else if (param_.X_array != nullptr) { + } else if (param_.X_array) { param_.Out_array->resize(param_.Out_array->size()); } else { LOG(FATAL) << "x or x_array must be set."; diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc index e3678e92c9d33be5428c82331ce963f4c6067369..de8bea345fe8da1e157665b93f9d50c6f6bbffa3 100644 --- a/lite/operators/conditional_block_op.cc +++ b/lite/operators/conditional_block_op.cc @@ -20,35 +20,37 @@ namespace paddle { namespace lite { namespace operators { -bool ConditionalBlockOpLite::CheckShape() const { +bool ConditionalBlockOp::CheckShape() const { CHECK_OR_FALSE(param_.cond); - CHECK_OR_FALSE(param_.sub_block); - CHECK_OR_FALSE(param_.scope); + CHECK_OR_FALSE(param_.program_desc); + CHECK_OR_FALSE(param_.exec_scope); return true; } -bool ConditionalBlockOpLite::InferShapeImpl() const { return true; } +bool ConditionalBlockOp::InferShapeImpl() const { return true; } -bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConditionalBlockOp::AttachImpl(const cpp::OpDesc& op_desc, Scope* scope) { auto condition = op_desc.Input("Cond").front(); param_.cond = scope->FindVar(condition)->GetMutable(); - auto inputs = op_desc.Input("Input"); - for (auto var : inputs) { - param_.x.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& input : inputs) { + auto* var = scope->FindVar(input); + CHECK(var); + param_.inputs.push_back(var->GetMutable()); } - auto outs = op_desc.Output("Out"); - for (auto var : outs) { - param_.outs.push_back(scope->FindVar(var)->GetMutable()); + for (const auto& out : outs) { + auto* var = scope->FindVar(out); + CHECK(var); + param_.outs.push_back(var->GetMutable()); } - param_.is_scalar_condition = op_desc.GetAttr("is_scalar_condition"); // obtain sub_block in core program.cc - param_.sub_block = sub_block_; - param_.scope = scope; - + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } @@ -57,4 +59,4 @@ bool ConditionalBlockOpLite::AttachImpl(const cpp::OpDesc &op_desc, } // namespace paddle REGISTER_LITE_OP(conditional_block, - paddle::lite::operators::ConditionalBlockOpLite); + paddle::lite::operators::ConditionalBlockOp); diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h index 1815731c8df3ac07bee80aa8e0cc658e752b5c4f..adcd8acdff391e2ae3ece9ec21669d853250dcf4 100644 --- a/lite/operators/conditional_block_op.h +++ b/lite/operators/conditional_block_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,27 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class ConditionalBlockOpLite : public OpLite { +class ConditionalBlockOp : public OpLite { public: - ConditionalBlockOpLite() {} - explicit ConditionalBlockOpLite(const std::string &op_type) - : OpLite(op_type) {} + ConditionalBlockOp() {} + explicit ConditionalBlockOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conditional_block"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable ConditionalBlockParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/operators/fake_quantize_dequantize_abs_max.cc b/lite/operators/fake_quantize_dequantize_abs_max.cc new file mode 100644 index 0000000000000000000000000000000000000000..354f5e9dcdbd55f634ae394187c5f9163eb9c25a --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/fake_quantize_dequantize_abs_max.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators {} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(fake_quantize_dequantize_abs_max, + paddle::lite::operators::FakeQuantizeDequantizeAbsMaxOpLite); diff --git a/lite/operators/fake_quantize_dequantize_abs_max.h b/lite/operators/fake_quantize_dequantize_abs_max.h new file mode 100644 index 0000000000000000000000000000000000000000..7413b448ea5e2317501960a246478d15242f9cdc --- /dev/null +++ b/lite/operators/fake_quantize_dequantize_abs_max.h @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FakeQuantizeDequantizeAbsMaxOpLite : public OpLite { + public: + FakeQuantizeDequantizeAbsMaxOpLite() {} + + explicit FakeQuantizeDequantizeAbsMaxOpLite(const std::string &type) + : OpLite(type) {} + + bool CheckShape() const override { return true; } + + bool InferShapeImpl() const override { return true; } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + auto out_scale = op_desc.Output("OutScale").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_scale = scope->FindVar(out_scale)->GetMutable(); + param_.bit_length = op_desc.GetAttr("bit_length"); + return true; + } + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fake_quantize_dequantize_abs_max"; + } + + private: + mutable FakeQuantDequantAbsMaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/gru_op.cc b/lite/operators/gru_op.cc index 862a1ff98f699393c9aa91afab978f947cc25187..0a9128dcd27870f6456b26ba636d4189267583be 100644 --- a/lite/operators/gru_op.cc +++ b/lite/operators/gru_op.cc @@ -75,9 +75,8 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { auto batch_reset_hidden_prev = op_desc.Output("BatchResetHiddenPrev").front(); auto batch_hidden = op_desc.Output("BatchHidden").front(); auto hidden = op_desc.Output("Hidden").front(); - param_.input = scope->FindVar(input)->GetMutable(); - if (op_desc.Input("H0").size()) { + if (!op_desc.Input("H0").empty()) { auto h0 = op_desc.Input("H0").front(); param_.h0 = scope->FindVar(h0)->GetMutable(); } @@ -90,7 +89,7 @@ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { scope->FindVar(batch_hidden)->GetMutable(); param_.hidden = scope->FindVar(hidden)->GetMutable(); - if (op_desc.HasInput("Bias")) { + if (!op_desc.Input("Bias").empty()) { auto bias = op_desc.Input("Bias").front(); param_.bias = scope->FindVar(bias)->GetMutable(); } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 0a573867b39ada10d1f42f5b6541f982d10edb9d..0d8456fc8755d24572dc91f9e1e1e4bc8706d460 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -90,9 +90,9 @@ struct SubgraphParam : ParamBase { std::vector output_names{}; std::vector input_data_names{}; std::vector output_data_names{}; - int sub_block_idx{-1}; - cpp::BlockDesc* sub_block_desc{nullptr}; - Scope* scope{nullptr}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; /// -------------------------- NN operators ------------------------------------ @@ -678,6 +678,13 @@ struct FakeChannelWiseDequantizeMaxAbsParam : ParamBase { std::vector quant_bits; }; +struct FakeQuantDequantAbsMaxParam : ParamBase { + const lite::Tensor* x{}; + lite::Tensor* out{}; + lite::Tensor* out_scale{}; + int bit_length; +}; + /// ----------------------- sgd operators ---------------------- struct SGDParam : ParamBase { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; @@ -939,11 +946,10 @@ struct CompareParam : 
ParamBase { }; struct WhileParam : ParamBase { - Scope* scope{}; Tensor* cond{}; - cpp::BlockDesc* sub_block{}; - std::vector x{}; - std::vector outs{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; }; struct TopkParam : ParamBase { @@ -1461,10 +1467,11 @@ struct MergeLodTensorParam : ParamBase { struct ConditionalBlockParam : ParamBase { const lite::Tensor* cond{}; - std::vector x{}; + std::vector inputs{}; std::vector outs{}; - cpp::BlockDesc* sub_block{}; - Scope* scope{}; + int block_idx{-1}; + std::shared_ptr program_desc{nullptr}; + Scope* exec_scope{nullptr}; bool is_scalar_condition{}; }; @@ -1634,11 +1641,36 @@ struct XPUMmdnnBidEmbGrnnAttParam : ParamBase { std::vector grnn_rv_wi_maxs; float att_fc_w_max{0.0f}; - lite::Tensor* grnn_fw_pool_out{}; // 1 - lite::Tensor* grnn_rv_pool_out{}; // 2 - lite::Tensor* att_pool_out{}; // 3 - lite::Tensor* concat_3in1_out{}; // 4 - lite::Tensor* emb_fw_out{}; // 5 + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam2 : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* emb0_out{}; + lite::Tensor* grnn_fw_pool_out{}; + lite::Tensor* grnn_rv_pool_out{}; + lite::Tensor* att_pool_out{}; + lite::Tensor* concat_3in1_out{}; + lite::Tensor* emb_fw_out{}; }; struct XPUMmdnnBidEmbAttParam : ParamBase { @@ -1650,8 +1682,8 @@ struct XPUMmdnnBidEmbAttParam : ParamBase { float att_fc_w_max{0.0f}; - lite::Tensor* att_pool_out{}; // 1 - lite::Tensor* emb_fw_out{}; // 2 + lite::Tensor* att_pool_out{}; + lite::Tensor* emb_fw_out{}; }; struct XPUMmdnnMatchConvTopkParam : ParamBase { @@ -1663,6 +1695,7 @@ struct XPUMmdnnMatchConvTopkParam : ParamBase { float input_w_max{0.0f}; float conv_w_max{0.0f}; std::vector topks; + int output_channel{0}; int channel_num{0}; int dim_t{0}; @@ -1671,7 +1704,7 @@ struct XPUMmdnnMatchConvTopkParam : ParamBase { struct XPUMmdnnMergeAllParam : ParamBase { std::vector concat_7in1_x; - std::vector concat_2in1_x; + std::vector concat_topk_x; lite::Tensor* grnn_fw_wh{}; lite::Tensor* grnn_fw_wi{}; lite::Tensor* grnn_rv_wh{}; @@ -1760,6 +1793,22 @@ struct ClipParam : ParamBase { float max{}; }; +struct PrintParam : ParamBase { + const lite::Tensor* in{}; + lite::Tensor* out{}; + std::string name; + int first_n{-1}; + std::string message; + int summarize{20}; + bool print_tensor_name{true}; + bool print_tensor_type{true}; + bool print_tensor_shape{true}; + bool print_tensor_lod{true}; + bool print_tensor_layout{true}; + std::string print_phase; + bool is_forward{true}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/print_op.cc b/lite/operators/print_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f4299aed06f17d7bf3bd30b9fec34c587168884 --- /dev/null +++ b/lite/operators/print_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/print_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool PrintOp::CheckShape() const { + CHECK_OR_FALSE(param_.in); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool PrintOp::InferShapeImpl() const { + param_.out->set_lod(param_.in->lod()); + param_.out->Resize(param_.in->dims()); + return true; +} + +bool PrintOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachParam(¶m_); + + param_.name = op_desc.Input("In").front(); + param_.in = scope->FindTensor(param_.name); + param_.out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.first_n = op_desc.GetAttr("first_n"); + param_.message = op_desc.GetAttr("message"); + param_.summarize = op_desc.GetAttr("summarize"); + param_.print_tensor_name = op_desc.GetAttr("print_tensor_name"); + param_.print_tensor_type = op_desc.GetAttr("print_tensor_type"); + param_.print_tensor_shape = op_desc.GetAttr("print_tensor_shape"); + param_.print_tensor_lod = op_desc.GetAttr("print_tensor_lod"); + param_.print_tensor_layout = op_desc.GetAttr("print_tensor_layout"); + param_.print_phase = op_desc.GetAttr("print_phase"); + param_.is_forward = op_desc.GetAttr("is_forward"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(print, paddle::lite::operators::PrintOp); diff --git a/lite/operators/print_op.h b/lite/operators/print_op.h new file mode 100644 index 0000000000000000000000000000000000000000..cd8e777b59c3aac92771442402cf16623b75fbef --- /dev/null +++ b/lite/operators/print_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PrintOp : public OpLite { + public: + PrintOp() {} + explicit PrintOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "print"; } + + private: + mutable PrintParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_pad_op.cc b/lite/operators/sequence_pad_op.cc index 687c4a1989deaa5afea2356338630fa0ee846cb5..858c0ffcbb1a8e739cf4575e9f2f8882fd231912 100644 --- a/lite/operators/sequence_pad_op.cc +++ b/lite/operators/sequence_pad_op.cc @@ -61,18 +61,19 @@ bool SequencePadOp::InferShapeImpl() const { max_seq_len = std::max(max_seq_len, static_cast(x_lod_0[i + 1] - x_lod_0[i])); } - if (param_.padded_length == -1) { - param_.padded_length = max_seq_len; + int real_padded_length = param_.padded_length; + if (real_padded_length == -1) { + real_padded_length = max_seq_len; } - CHECK_GE(param_.padded_length, max_seq_len) + CHECK_GE(real_padded_length, max_seq_len) << "The SequencePadOp Attr(padded_length) should be greater than or " "equal to the length of the longest original sequence. But the " "padded_length we received is " - << param_.padded_length + << real_padded_length << ", the length of the longest original sequence is " << max_seq_len; int out_dim_0 = seq_num; - std::vector out_dims_vec{out_dim_0, param_.padded_length}; + std::vector out_dims_vec{out_dim_0, real_padded_length}; std::vector len_dims_vec{out_dim_0}; auto time_step_dims_vec = time_step_dims.Vectorize(); out_dims_vec.insert( @@ -87,7 +88,7 @@ bool SequencePadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { &scope->FindVar(opdesc.Input("X").front())->Get()); param_.PadValue = const_cast( &scope->FindVar(opdesc.Input("PadValue").front())->Get()); - param_.Length = scope->FindVar(opdesc.Input("Length").front()) + param_.Length = scope->FindVar(opdesc.Output("Length").front()) ->GetMutable(); param_.Out = scope->FindVar(opdesc.Output("Out").front())->GetMutable(); diff --git a/lite/operators/sequence_unpad_op.cc b/lite/operators/sequence_unpad_op.cc index b91d43c741f002b2bdb30e161688cd40b462faee..4f4497f0b81b5710e71cd0a2fcce10e9559d9d30 100644 --- a/lite/operators/sequence_unpad_op.cc +++ b/lite/operators/sequence_unpad_op.cc @@ -32,32 +32,7 @@ bool SequenceUnpadOp::CheckShape() const { return true; } -bool SequenceUnpadOp::InferShapeImpl() const { - auto x_dims = param_.X->dims(); - auto len_dims = param_.Length->dims(); - - auto *seq_len_ptr = param_.Length->data(); - int64_t batch_size = len_dims[0]; - std::vector out_lod0(batch_size + 1, 0); - for (int64_t i = 0; i < batch_size; ++i) { - out_lod0[i + 1] = out_lod0[i] + seq_len_ptr[i]; - } - paddle::lite::LoD out_lod; - out_lod.push_back(out_lod0); - - int64_t out_dim0 = out_lod0.back(); - std::vector out_dims{out_dim0}; - if (x_dims.size() == 2) { - out_dims.push_back(1); - } else { - for (size_t i = 2; i < x_dims.size(); ++i) { - out_dims.push_back(x_dims[i]); - } - } - param_.Out->Resize(out_dims); - param_.Out->set_lod(out_lod); - return true; -} +bool SequenceUnpadOp::InferShapeImpl() const { return true; } 
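The output shape and LoD logic deleted from SequenceUnpadOp::InferShapeImpl above depends on the runtime contents of the Length tensor, so it presumably has to be recomputed wherever that data is actually available (e.g. in the kernel). For reference, a standalone sketch of the removed computation, expressed over plain vectors rather than lite::Tensor:

#include <cstdint>
#include <vector>

// Mirrors the removed InferShapeImpl logic: the first output dimension is the
// sum of the per-sequence lengths, the remaining dimensions come from X (or a
// trailing 1 when X is 2-D), and out_lod0 is the running sum of the lengths.
struct UnpadShape {
  std::vector<int64_t> out_dims;
  std::vector<uint64_t> out_lod0;
};

UnpadShape InferUnpadShape(const std::vector<int64_t>& x_dims,
                           const std::vector<int64_t>& seq_len) {
  UnpadShape r;
  r.out_lod0.assign(seq_len.size() + 1, 0);
  for (size_t i = 0; i < seq_len.size(); ++i) {
    r.out_lod0[i + 1] = r.out_lod0[i] + static_cast<uint64_t>(seq_len[i]);
  }
  r.out_dims.push_back(static_cast<int64_t>(r.out_lod0.back()));
  if (x_dims.size() == 2) {
    r.out_dims.push_back(1);
  } else {
    for (size_t i = 2; i < x_dims.size(); ++i) {
      r.out_dims.push_back(x_dims[i]);
    }
  }
  return r;
}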
bool SequenceUnpadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc index 9ac07e96334eda9f0001d33e0789f9de15c4ca67..fec5a0e3254328220508f28a16b110beb01fb613 100644 --- a/lite/operators/subgraph_op.cc +++ b/lite/operators/subgraph_op.cc @@ -39,10 +39,11 @@ bool SubgraphOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { op_desc.GetAttr>("input_data_names"); param_.output_data_names = op_desc.GetAttr>("output_data_names"); - CHECK(param_.sub_block_desc); - param_.sub_block_idx = op_desc.GetAttr("sub_block"); - param_.scope = scope; - CHECK(param_.scope); + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h index edbfb922044d60165e589d389cd8cfb3b2547796..df6448f2f78a08f41ac037a13d14cbca1725cfb5 100644 --- a/lite/operators/subgraph_op.h +++ b/lite/operators/subgraph_op.h @@ -13,14 +13,11 @@ // limitations under the License. #pragma once - +#include #include #include -#include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" #include "lite/utils/all.h" namespace paddle { @@ -37,14 +34,18 @@ class SubgraphOp : public OpLite { bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "subgraph"; } - void SetSubBlock(cpp::BlockDesc *desc) { param_.sub_block_desc = desc; } - cpp::BlockDesc *GetSubBlock() { return param_.sub_block_desc; } + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable SubgraphParam param_; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc index 83b6cc6a24ed1537adec8fd7d54a477edf91f873..612632acb4fbea692aa4a02dbd94bb1b506460bb 100644 --- a/lite/operators/var_conv_2d_op.cc +++ b/lite/operators/var_conv_2d_op.cc @@ -26,10 +26,16 @@ bool VarConv2dOp::InferShapeImpl() const { return true; } bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.X = const_cast( &scope->FindVar(opdesc.Input("X").front())->Get()); - // param_.ROW = const_cast( - // &scope->FindVar(opdesc.Input("ROW").front())->Get()); - // param_.COLUMN = const_cast( - // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + if (opdesc.HasInput("ROW") && !opdesc.Input("ROW").empty()) { + param_.ROW = const_cast( + &scope->FindVar(opdesc.Input("ROW").front())->Get()); + CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + } + if (opdesc.HasInput("COLUMN") && !opdesc.Input("COLUMN").empty()) { + param_.COLUMN = const_cast( + &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + } param_.W = const_cast( &scope->FindVar(opdesc.Input("W").front())->Get()); param_.Out = @@ -37,8 +43,6 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { param_.Col = scope->FindVar(opdesc.Output("Col").front())->GetMutable(); CHECK(param_.X) << "X(Input) of VarConv2dOP 
should not be null."; - // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; - // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; CHECK(param_.W) << "W(Input) of VarConv2dOP should not be null."; CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; diff --git a/lite/operators/while_op.cc b/lite/operators/while_op.cc index 1dcf9553f331ee6646ad6d93de048728a0886116..ab8e4a5489c13e042bf0d07da1228f33626a1d43 100644 --- a/lite/operators/while_op.cc +++ b/lite/operators/while_op.cc @@ -20,31 +20,23 @@ namespace paddle { namespace lite { namespace operators { -bool WhileOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.sub_block); - CHECK_OR_FALSE(param_.scope); +bool WhileOp::CheckShape() const { CHECK_OR_FALSE(param_.cond); + CHECK_OR_FALSE(param_.program_desc); + CHECK_OR_FALSE(param_.exec_scope); return true; } -bool WhileOpLite::InferShapeImpl() const { return true; } - -bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { - auto inputs = op_desc.Input("X"); - auto outs = op_desc.Output("Out"); - - for (auto var : inputs) { - // param_.x.push_back(scope->FindVar(var)->GetMutable()); - } - for (auto var : outs) { - // param_.outs.push_back(scope->FindVar(var)->GetMutable()); - } - param_.sub_block = sub_block_; +bool WhileOp::InferShapeImpl() const { return true; } +bool WhileOp::AttachImpl(const cpp::OpDesc &op_desc, Scope *scope) { auto condition = op_desc.Input("Condition"); param_.cond = scope->FindVar(condition[0])->GetMutable(); - param_.scope = scope; - + CHECK(param_.program_desc); + param_.block_idx = op_desc.GetAttr("sub_block"); + CHECK_GE(param_.block_idx, 0); + param_.exec_scope = scope; + CHECK(param_.exec_scope); return true; } @@ -52,4 +44,4 @@ bool WhileOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { } // namespace lite } // namespace paddle -REGISTER_LITE_OP(while, paddle::lite::operators::WhileOpLite); +REGISTER_LITE_OP(while, paddle::lite::operators::WhileOp); diff --git a/lite/operators/while_op.h b/lite/operators/while_op.h index 94aec15a6d3eb60036bf9c2168fdbd855b84a396..e448ee568723b24a241c5bb127ac61458385337e 100644 --- a/lite/operators/while_op.h +++ b/lite/operators/while_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/op_lite.h" @@ -23,24 +24,30 @@ namespace paddle { namespace lite { namespace operators { -class WhileOpLite : public OpLite { +class WhileOp : public OpLite { public: - WhileOpLite() {} - explicit WhileOpLite(const std::string &op_type) : OpLite(op_type) {} + WhileOp() {} + explicit WhileOp(const std::string &op_type) : OpLite(op_type) {} bool CheckShape() const override; bool InferShapeImpl() const override; - bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &opdesc, Scope *scope) override; void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "while"; } - void SetSubBlock(cpp::BlockDesc *desc) { sub_block_ = desc; } + + void SetProgramDesc(std::shared_ptr program_desc) { + param_.program_desc = program_desc; + } + std::shared_ptr GetProgramDesc() { + return param_.program_desc; + } private: mutable WhileParam param_; - cpp::BlockDesc *sub_block_; }; } // namespace operators diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt index 844c3f2ac7146e05b2d93eac76279df022e06652..e9c6574c19bcb6a238503d7b5fc955db9b96d689 100644 --- a/lite/tests/api/CMakeLists.txt +++ b/lite/tests/api/CMakeLists.txt @@ -1,3 +1,13 @@ +if(LITE_WITH_ARM) + lite_cc_test(test_transformer_with_mask_fp32_arm SRCS test_transformer_with_mask_fp32_arm.cc + DEPS ${lite_model_test_DEPS} paddle_api_full + ARM_DEPS ${arm_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/transformer_with_mask_fp32 SERIAL) + if(WITH_TESTING) + add_dependencies(test_transformer_with_mask_fp32_arm extern_lite_download_transformer_with_mask_fp32_tar_gz) + endif() +endif() + if(LITE_WITH_XPU) lite_cc_test(test_resnet50_lite_xpu SRCS test_resnet50_lite_xpu.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils diff --git a/lite/tests/api/test_mmdnn_lite_xpu.cc b/lite/tests/api/test_mmdnn_lite_xpu.cc index a2a98821e70cb462b23887f851cfc4bce6b463ca..72d774db14d955f17caee217f13fddb32acb93c3 100644 --- a/lite/tests/api/test_mmdnn_lite_xpu.cc +++ b/lite/tests/api/test_mmdnn_lite_xpu.cc @@ -26,156 +26,171 @@ DEFINE_bool(perf, false, "perf?"); DEFINE_string(perf_input, "perf_input", "perf_input"); +DEFINE_int32(perf_batch_size, 40, "perf_batch_size"); +DEFINE_bool(use_xpu, true, "use_xpu?"); +DEFINE_int32(perf_dev, 0, "perf_dev"); namespace paddle { namespace lite { -std::vector input0; -std::vector input0_lod = {0}; -std::vector input1; -std::vector input1_lod = {0}; -std::vector input2; -std::vector input2_lod = {0}; -std::vector input3; -std::vector input3_lod = {0}; -std::vector input4; -std::vector input4_lod = {0}; -std::vector input5; -std::vector input5_lod = {0}; +class SampleReader { + public: + std::vector> data; + std::vector> lod; -void ParseInput() { - std::string raw_input = - "0 1;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " - "760166;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 145 " - "10251 839 5 1779 1729 1779 1729 18 2707 6 2707 20 4742 4937 432 6 " - "3869;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 767614 " - "767614 1020808 769579 793958 793958 1050488 911898 751332 751332 750336 " - "750799 750336 751575 751575 751544 751735 751397 751365 751512 751512 " - "753011 751562;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 " - "145 10251 839 2 1211 3 3719 720 1540 145 10251 839 9405 4315 5998 4 2 " - "600 373 41 3719 428 52 44 10251 4302 1319 7 
12 2 768 6 918 6 841 870 8 " - "843 8 271;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 " - "767614 767614 1020808 769579 793958 793958 1050488 911898 2 773899 " - "773899 3719 1118420 1118420 1050488 1050488 911898 9405 4315 5998 4 2 " - "785435 785435 41 3719 760166 760166 44 10251 4302 1319 750118 750118 2 " - "750465 750465 750274 750398 750233 751252 751252 753447 752830 753112;\n" - "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " - "760166;2109 2467 1805 227 3719 428 52 18 1102 10327 252 20 6 242 78 6 " - "532 78;2109 2467 1805 1245431 1245431 760166 760166 18 1035176 1035176 " - "764393 764393 752116 242 750370 750370 752081 751247;2109 2467 1805 227 " - "3719 428 52 18 1102 10327 252 20 2 145 242 1050 252 3582 2212;2109 2467 " - "1805 1245431 1245431 760166 760166 18 1035176 1035176 764393 764393 2 " - "871717 871717 757921 757921 3582 2212;\n" - "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " - "760166;145 10251 839 76 31 1337 823 7506 567 65 170 8 21293 3719 5 43 " - "394 743 42;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " - "762617 866652 8 21293 3719 5 43 914758 914758 757202;145 10251 839 76 " - "31 1337 823 7506 567 65 170 8 21293 3719 2 17580 30 523324 3 10251 4104 " - "281 3 8511 3719 2217 3 13 226 3083 4 11251 1606 357 9 2 145 10251 839 " - "76 31 1337 823 7506 567 65 170 2 7506 2445 8 145 10251 839 528 839 " - "19670 6538;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " - "762617 866652 8 21293 3719 2 816626 816626 523324 3 1181698 1181698 " - "751656 780821 1063148 3719 2217 3 752498 752498 831323 753602 11251 " - "1606 357 9 2 1050488 1050488 911898 750016 750016 1337 823 7506 762617 " - "762617 866652 2 7506 753045 753045 756756 1050488 911898 528 839 19670 " - "6538;\n" - "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " - "760166;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 2899 " - "229 10 10 10;1050488 1050488 911898 807966 750273 1035176 1035176 " - "1237875 41 3719 760166 760166 753645 753645 750273 2899 229 750001 " - "750001 750001;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 " - "2899 229 10 10 10 2 1177 8 145 10251 839 99 4 1102 10327 2196 41 3719 " - "428 52 44 99 4 2 101 8 1922 17 2184 2 1154 1922 72 1198 1266 " - "4516;1050488 1050488 911898 807966 750273 1035176 1035176 1237875 41 " - "3719 760166 760166 753645 753645 750273 2899 229 750001 750001 750001 2 " - "750257 750257 756756 1050488 911898 807966 750273 1035176 1035176 " - "1237875 41 3719 760166 760166 753645 753645 750273 2 764513 764513 " - "851213 851213 854628 2 753018 753018 754317 753328 754085 754070;\n" - "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " - "760166;73 5347 112 8 145 10251 839 262 169 22729 3719 6 743 6 339 1156 " - "78 136 399 693 128 571;776150 776150 112 756756 756756 1050488 911898 " - "791355 791355 22729 3719 6 758277 758277 750137 750234 750241 750178 " - "750055 750216 750212 750049;73 5347 112 8 145 10251 839 262 169 22729 " - "3719 2 588 415 549 415 115 23;776150 776150 112 756756 756756 1050488 " - "911898 791355 791355 22729 3719 2 750221 750221 750262 750277 750277 " - "750261;"; - auto raw_lines = Split(raw_input, "\n"); - for (auto& raw_line : raw_lines) { - auto inputx = Split(raw_line, ";"); - for (size_t i = 1; i < inputx.size(); ++i) { - auto tokens = Split(inputx[i], " "); - static std::vector* const input_array[] = { - &input0, &input0, &input1, &input2, &input3, &input4, &input5}; - static std::vector* const 
lod_array[] = {&input0_lod, - &input0_lod, - &input1_lod, - &input2_lod, - &input3_lod, - &input4_lod, - &input5_lod}; - for (auto token : tokens) { - input_array[i]->push_back((int64_t)atoi(token.c_str())); - } - lod_array[i]->push_back((uint64_t)tokens.size() + - (*lod_array[i])[lod_array[i]->size() - 1]); - } - } - return; -} + void Read() { + std::string raw_input = + "0 1;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 142 " + "2114 197 10 2899;125 756226 756913 855693 760836 10 750793;125 584 " + "142 2114 197 10 2899 2 825 32 18499 125 584 295 2114 197 2114 2730 6 " + "15 32 18499 125 584 142 295 2114 1423 21 2 334 863 5122 197 974 21 " + "295 619 25 2114 1755 2701 197 15 216 23 18499 125 584 142 599 3228 23 " + "2 5122 1917 804 5 2114 197 1236 3 2114 1403 15 3886 1080 23 1150 125 " + "475 23 2998 23;125 756226 756913 855693 760836 10 750793 2 825 750355 " + "18499 881680 756226 295 765124 760836 2114 872813 754265 15 32 18499 " + "881680 756226 756913 761251 765124 752843 766823 2 334 759834 5122 " + "774643 758458 21 295 755114 25 1148365 1755 2701 197 15 216 23 18499 " + "881680 756226 756913 826848 3228 23 2 5122 831009 804 752371 2114 " + "760836 1236 3 2114 910393 15 3886 1080 23 877375 752137 761034 792123 " + "2998 23;1;1;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;121 28 1054 " + "1459 125 72 32 2321 531 125 295 584 142 2114 197 14 477 30 121;121 28 " + "764114 1459 753052 750694 750001 886192 750435 752179 295 584 756913 " + "855693 760836 14 477 30 753504;121 28 1054 1459 125 72 32 2321 531 " + "125 295 584 142 2114 197 2 121 28 1054 1459 125 72 32 2321 531 125 " + "295 584 142 4 263 2114 197 43 95 863 2114 323 20 142 626 11 2 45 10 " + "45 58 142 65 918 741 2114 197 764 3 5122 26 51 1266 2037 295 222 1121 " + "4491 3 545 4338 11 2 5122 26 495 3 142 3444 3249 2114 197 3 626 4 " + "2794;121 28 764114 1459 753052 750694 750001 886192 750435 752179 295 " + "584 756913 855693 760836 2 121 28 764114 1459 753052 750694 750001 " + "886192 750435 752179 295 584 756913 4 750885 2114 760836 43 750030 " + "754302 2114 323 822131 142 626 769001 2 45 750128 750324 58 142 " + "1147454 918 910829 2114 760836 841946 767340 5122 779102 51 1266 2037 " + "756461 222 752031 942669 1139389 780275 4338 830597 2 5122 779102 495 " + "761418 142 3444 852932 2114 760836 3 760162 757966 751127;121 295 " + "5593 142 2114 197;121 295 5593 925208 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 125 584 " + "142 2114 1423 14 5283 1745 73;207 752276 756226 756913 855693 752843 " + "14 5283 781651 786597;6109 18807 142 5 64 5283 1745 73 3690 1060 3626 " + "4 716 51 1030 2114 197 4 428 936 9066 10 10 10 2 207 125 584 142 2114 " + "1423 2 15329 2114 197 5669 401 318 285 953 4 2114 197 2285 7 1783 11 " + "2 5122 197 14017 584;6109 18807 142 5 755319 5283 781651 786597 3690 " + "1060 3626 4 716 910478 1030 2114 760836 4 750323 936 9066 10 750002 " + "750002 2 207 752276 756226 756913 855693 752843 2 15329 2114 760836 " + "5669 401 318 757541 750261 4 2114 760836 2285 7 757639 11 2 5122 " + "774643 14017 584;125 584 142 1745 5122;125 756226 756913 1745 " + "755836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;149 396 778 " + "584 142 295 2114 1423 14 64 125 584 73 21 36670 5834 10 211 25;149 " + "751876 1048872 584 756913 761251 765124 752843 14 64 125 756226 73 " + "944567 36670 5834 10 750012 753240;101 10 2114 197 3 946 2 149 396 " + "778 584 142 295 2114 1423 2 2610 6 1444 111 2114 948 72 32 21 15 494 " + "25 4 2114 197 5669 
1145 2 148 295 149 396 778 584 142 295 21 22853 41 " + "348 619 25 366 5305 2114 807 4 1115 381 1955 2114 11;101 751178 2114 " + "760836 3 946 2 149 751876 1048872 584 756913 761251 765124 752843 2 " + "2610 753567 775165 750899 972788 948 750125 750001 751875 15 494 25 4 " + "2114 760836 5669 1145 2 148 808886 982157 751876 1048872 584 756913 " + "761251 790772 22853 41 348 619 25 366 894206 2114 1008440 4 753953 " + "381 851474 765868 11;149 396 778 584 142 295 2 149 396 354 778 584 " + "142 1333 2 584 778 295 5122 2 149 396 778 584 3609 2 149 396 64478 " + "816 14246 1423 2 149 396 584 32 127 19 3609 2 149 396 584 73 2 149 " + "396 584 778 295 2285 142 4922 323 2 149 396 584 2114 2 149 396 253 " + "584 2114 197;149 751876 1048872 584 756913 761251 2 149 751876 756286 " + "767182 584 756913 1333 2 584 778 897778 941364 2 149 751876 1048872 " + "584 1102835 2 149 751876 64478 816 14246 912094 2 149 751876 584 " + "773547 127 750771 791456 2 149 751876 584 73 2 149 751876 584 778 " + "897778 2285 751493 791984 323 2 149 751876 584 2114 2 149 751876 " + "808443 835481 2114 760836;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;125 584 545 " + "149 14 125 584;125 756226 545 874302 14 125 756226;2204 25 30 1692 " + "1770 6534 295 125 584 72 32 1346 4 2698 2114 197 11 2 4235 4301 240 " + "295 125 584 72 32 21 6708 15 56974 494 25 1030 2114 197 110 804 495 " + "611 2 221 759 341 6 5283 1745 73 71 2114 1423 71 125 584 545 149 149 " + "2 505 345 58 125 584 65 3486 2114 295 4 45 786 196 6604 6086;2204 25 " + "30 797189 1770 1191824 295 752782 756226 751697 750001 1346 4 2698 " + "2114 760836 765158 2 4235 4301 240 753859 752782 756226 751697 750001 " + "751875 6708 15 56974 494 25 1030 2114 760836 777607 762850 966521 611 " + "2 221 752565 750130 750084 910219 781651 786597 71 2114 752843 71 125 " + "756226 545 874302 149 2 505 825657 782848 125 756226 65 3486 2114 " + "760669 4 45 755747 758903 6604 6086;125 584 2114 2 125 584 2114 1423 " + "2 125 584 2114 149 2 149 584 1745 5122 725 2 2114 125 584 2 125 584 " + "2114 2 2621 584 2114 2 527 37 2754 130 170 1013 494 887 240 2 4521 " + "11111 586 2321 531 125 584 142 1360 816 2842 1423 2 125 584 2114;125 " + "756226 2114 2 125 756226 2114 752843 2 125 756226 2114 783644 2 149 " + "760183 1745 755836 725 2 2114 125 756226 2 125 756226 2114 2 2621 " + "932600 2114 2 527 751304 869964 754462 170 1013 750719 778287 774620 " + "2 4521 11111 586 2321 750435 752179 756226 756913 1360 764399 2842 " + "1423 2 125 756226 2114;\n" + "0 0;125 584 142 2114 197;125 756226 756913 855693 760836;207 584 142 " + "2114 197 4 207 584 142 2114 197 674 14 240 4328 14 4328 767;207 " + "1237071 756913 855693 760836 4 207 1237071 756913 855693 760836 674 " + "14 240 755573 14 4328 795065;207 584 142 2114 197 2 325 71 71 207 584 " + "142 2114 197 2 876 125 140 2114 197 2 207 584 142 2114 197 674 1210 " + "239 4328 767 268 1349 485 28 4389 504 3 941 57 1419 1978 11;207 " + "1237071 756913 855693 760836 2 325 71 71 207 1237071 756913 855693 " + "760836 2 876 125 750977 1250790 760836 2 207 1237071 756913 855693 " + "760836 674 814792 755820 812174 795065 818859 817155 816597 761001 " + "774461 780904 820475 1109800 790141 790459 780324 770390;584 142 295 " + "2114 232 2 207 584 2114 197 2 584 142 295 2114 232 2 584 142 512 2114 " + "197;584 756913 761251 765124 1006359 2 207 1237071 2114 760836 2 584 " + "756913 761251 765124 1006359 2 584 756913 879930 2114 760836;"; -class MmdnnReader { - std::ifstream ifs; - std::vector StringSplit(const std::string& in, - 
const std::string& delim) { - std::vector ret; - if (in == "") { - return ret; - } - auto begpos = in.find_first_not_of(delim); - while (begpos != std::string::npos) { - auto endpos = in.find_first_of(delim, begpos); - if (endpos == std::string::npos) { - endpos = in.size(); + auto lines = Split(raw_input, "\n"); + for (auto& line : lines) { + auto split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } } - std::string ssubstr = in.substr(begpos, endpos - begpos); - ret.push_back(ssubstr); - begpos = endpos + 1; - if (endpos >= (in.size() - 1)) { - break; + + for (size_t i = 1; i < split1.size(); ++i) { + auto split2 = Split(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (auto e : split2) { + data[i - 1].push_back(std::stoi(e.c_str(), nullptr, 0)); + } + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); } } - return ret; } +}; + +class FileReader { + std::ifstream ifs; public: - std::vector data[6]; - std::vector lod[6]; + std::vector> data; + std::vector> lod; void Init(std::string file_name) { ifs.open(file_name); } int Read(int maxline) { - for (int i = 0; i < 6; i++) { - data[i].clear(); - } - for (int i = 0; i < 6; i++) { - lod[i].clear(); - lod[i].push_back(0); - } + data.clear(); + lod.clear(); + std::string line; int cnt = 0; while (cnt < maxline && getline(ifs, line)) { - std::vector split1 = StringSplit(line, ";"); - for (int i = 1; i < 7; i++) { - std::vector split2 = StringSplit(split1[i], " "); + std::vector split1 = Split(line, ";"); + if (data.size() == 0) { + for (size_t i = 1; i < split1.size(); ++i) { + data.push_back(std::vector()); + lod.push_back({0}); + } + } + + for (size_t i = 1; i < split1.size(); i++) { + std::vector split2 = Split(split1[i], " "); if (split2.size() == 0) { split2.push_back("1280000"); } for (size_t j = 0; j < split2.size(); j++) { data[i - 1].push_back(std::stoi(split2[j].c_str(), nullptr, 0)); } - // if (i % 2 == 1) { - // lod[i / 2].push_back(lod[i / 2].back() + split2.size()); - //} lod[i - 1].push_back(lod[i - 1].back() + split2.size()); } cnt++; @@ -186,36 +201,47 @@ class MmdnnReader { TEST(MMDNN, test_mmdnn_lite_xpu) { lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, - lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, - lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, - lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, - lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + // config.set_model_dir(FLAGS_model_dir); + config.set_model_file(FLAGS_model_dir + "/__model__"); + config.set_param_file(FLAGS_model_dir + "/__param__"); + config.set_xpu_dev_per_thread(FLAGS_perf_dev); + if (FLAGS_use_xpu) { + config.set_valid_places( + {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } else { + config.set_valid_places( + {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + } config.set_xpu_workspace_l3_size_per_thread(); auto predictor = lite_api::CreatePaddlePredictor(config); if (FLAGS_perf) { - MmdnnReader reader; - reader.Init(FLAGS_perf_input); - int UB_batch = 40; // upper bound of 
batch + FileReader file_reader; + file_reader.Init(FLAGS_perf_input); + int UB_batch = FLAGS_perf_batch_size; // upper bound of batch int iter = 0; double tsc_sum = 0; while (true) { - int batch = reader.Read(UB_batch); + int batch = file_reader.Read(UB_batch); if (batch <= 0) { break; } ++iter; - for (int i = 0; i < 6; ++i) { + for (size_t i = 0; i < file_reader.data.size(); ++i) { auto input_x = predictor->GetInput(i); - input_x->Resize({(int64_t)reader.data[i].size(), 1}); - input_x->SetLoD({reader.lod[i]}); + input_x->Resize({(int64_t)file_reader.data[i].size(), 1}); + input_x->SetLoD({file_reader.lod[i]}); auto* data_x = input_x->mutable_data(); memcpy(data_x, - reader.data[i].data(), - reader.data[i].size() * sizeof(int64_t)); + file_reader.data[i].data(), + file_reader.data[i].size() * sizeof(int64_t)); } auto start = GetCurrentUS(); @@ -232,55 +258,17 @@ TEST(MMDNN, test_mmdnn_lite_xpu) { return; } - ParseInput(); + SampleReader sample_reader; + sample_reader.Read(); - { - std::vector input0_shape{(int64_t)input0.size(), 1}; - auto input_tensor0 = predictor->GetInput(0); - input_tensor0->Resize(input0_shape); - input_tensor0->SetLoD({input0_lod}); - auto* data0 = input_tensor0->mutable_data(); - memcpy(data0, input0.data(), sizeof(int64_t) * input0.size()); - } - { - std::vector input1_shape{(int64_t)input1.size(), 1}; - auto input_tensor1 = predictor->GetInput(1); - input_tensor1->Resize(input1_shape); - input_tensor1->SetLoD({input1_lod}); - auto* data1 = input_tensor1->mutable_data(); - memcpy(data1, input1.data(), sizeof(int64_t) * input1.size()); - } - { - std::vector input2_shape{(int64_t)input2.size(), 1}; - auto input_tensor2 = predictor->GetInput(2); - input_tensor2->Resize(input2_shape); - input_tensor2->SetLoD({input2_lod}); - auto* data2 = input_tensor2->mutable_data(); - memcpy(data2, input2.data(), sizeof(int64_t) * input2.size()); - } - { - std::vector input3_shape{(int64_t)input3.size(), 1}; - auto input_tensor3 = predictor->GetInput(3); - input_tensor3->Resize(input3_shape); - input_tensor3->SetLoD({input3_lod}); - auto* data3 = input_tensor3->mutable_data(); - memcpy(data3, input3.data(), sizeof(int64_t) * input3.size()); - } - { - std::vector input4_shape{(int64_t)input4.size(), 1}; - auto input_tensor4 = predictor->GetInput(4); - input_tensor4->Resize(input4_shape); - input_tensor4->SetLoD({input4_lod}); - auto* data4 = input_tensor4->mutable_data(); - memcpy(data4, input4.data(), sizeof(int64_t) * input4.size()); - } - { - std::vector input5_shape{(int64_t)input5.size(), 1}; - auto input_tensor5 = predictor->GetInput(5); - input_tensor5->Resize(input5_shape); - input_tensor5->SetLoD({input5_lod}); - auto* data5 = input_tensor5->mutable_data(); - memcpy(data5, input5.data(), sizeof(int64_t) * input5.size()); + for (size_t i = 0; i < sample_reader.data.size(); ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)sample_reader.data[i].size(), 1}); + input_x->SetLoD({sample_reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + sample_reader.data[i].data(), + sample_reader.data[i].size() * sizeof(int64_t)); } for (int i = 0; i < FLAGS_warmup; ++i) { diff --git a/lite/tests/api/test_transformer_with_mask_fp32_arm.cc b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e65b017aa1440683d86d0da03686a2be9c4c6ee5 --- /dev/null +++ b/lite/tests/api/test_transformer_with_mask_fp32_arm.cc @@ -0,0 +1,274 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { + +template +void SetTensorData(const std::vector &data, + const std::vector &shape, + paddle::lite_api::Tensor *tensor, + const std::vector> &lod = {}) { + tensor->Resize(shape); + tensor->SetLoD(lod); + std::copy(data.begin(), data.end(), tensor->mutable_data()); +} + +void PrepareInputData( + const std::shared_ptr &predictor, + std::vector src_word_data, + int max_seq_len = 16, // padding + int max_out_len = 8, + int bos_idx = 0, + int eos_idx = 1, + int n_head = 8) { + // src_word + auto src_word = predictor->GetInput(0); + int seq_len = src_word_data.size(); + for (int i = seq_len; i < max_seq_len; i++) { + src_word_data.push_back(eos_idx); + } + std::vector src_word_shape{ + 1, static_cast(src_word_data.size())}; + SetTensorData(src_word_data, src_word_shape, src_word.get()); + // src_pos + auto src_pos = predictor->GetInput(1); + std::vector src_pos_data(src_word_data.size()); + std::iota(src_pos_data.begin(), src_pos_data.end(), 0); + std::vector src_pos_shape{1, + static_cast(src_pos_data.size())}; + SetTensorData(src_pos_data, src_pos_shape, src_pos.get()); + // src_slf_attn_bias + auto src_slf_attn_bias = predictor->GetInput(2); + std::vector src_slf_attn_bias_data(1 * n_head * src_word_data.size() * + src_word_data.size()); + int offset = 0; + for (int j = 0; j < 1 * n_head * src_word_data.size(); j++) { + for (int i = 0; i < seq_len; i++) { + src_slf_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + src_slf_attn_bias_data[offset++] = -1e9f; + } + } + std::vector src_slf_attn_bias_shape{ + 1, + n_head, + static_cast(src_word_data.size()), + static_cast(src_word_data.size())}; + SetTensorData( + src_slf_attn_bias_data, src_slf_attn_bias_shape, src_slf_attn_bias.get()); + // trg_word + auto trg_word = predictor->GetInput(3); + std::vector trg_word_data(2, 0); + std::vector trg_word_shape{2, 1}; + std::vector lod_level_0{0, 2}; + std::vector lod_level_1{0, 1, 2}; + std::vector> trg_word_lod(2); + trg_word_lod[0] = lod_level_0; + trg_word_lod[1] = lod_level_1; + SetTensorData( + trg_word_data, trg_word_shape, trg_word.get(), trg_word_lod); + // init_score + auto init_score = predictor->GetInput(4); + std::vector init_score_data(2); + init_score_data[0] = 0; + init_score_data[1] = -1e9f; + std::vector init_score_shape{2, 1}; + std::vector> init_score_lod(trg_word_lod); + SetTensorData( + init_score_data, init_score_shape, init_score.get(), init_score_lod); + // init_idx + auto init_idx = predictor->GetInput(5); + std::vector init_idx_data(2, 0); + std::vector init_idx_shape{2}; + SetTensorData(init_idx_data, init_idx_shape, 
init_idx.get()); + // trg_slf_attn_bias + auto trg_slf_attn_bias = predictor->GetInput(6); + std::vector trg_slf_attn_bias_data(max_out_len * n_head * 1 * + max_out_len); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + trg_slf_attn_bias_data[offset++] = (i <= k) ? 0.0f : -1e9f; + } + } + } + std::vector trg_slf_attn_bias_shape{ + max_out_len, n_head, 1, max_out_len}; + SetTensorData( + trg_slf_attn_bias_data, trg_slf_attn_bias_shape, trg_slf_attn_bias.get()); + // trg_src_attn_bias + auto trg_src_attn_bias = predictor->GetInput(7); + std::vector trg_src_attn_bias_data(1 * n_head * 1 * + src_word_data.size()); + offset = 0; + for (int j = 0; j < 1 * n_head * 1; j++) { + for (int i = 0; i < seq_len; i++) { + trg_src_attn_bias_data[offset++] = 0.0f; + } + for (int i = seq_len; i < src_word_data.size(); i++) { + trg_src_attn_bias_data[offset++] = -1e9f; + } + } + std::vector trg_src_attn_bias_shape{ + 1, n_head, 1, static_cast(src_word_data.size())}; + SetTensorData( + trg_src_attn_bias_data, trg_src_attn_bias_shape, trg_src_attn_bias.get()); + // kv_padding_selection + auto kv_padding_selection = predictor->GetInput(8); + std::vector kv_padding_selection_data(max_out_len * n_head * + max_out_len * 1); + offset = 0; + for (int k = 0; k < max_out_len; k++) { + for (int j = 0; j < n_head; j++) { + for (int i = 0; i < max_out_len; i++) { + kv_padding_selection_data[offset++] = (i == k) ? 1.0f : 0.0f; + } + } + } + std::vector kv_padding_selection_shape{ + max_out_len, n_head, max_out_len, 1}; + SetTensorData(kv_padding_selection_data, + kv_padding_selection_shape, + kv_padding_selection.get()); +} + +void CheckOutputData( + const std::shared_ptr &predictor, + const std::vector &ref_seq_ids_data, + const std::vector &ref_seq_scores_data) { + // seq_ids + auto seq_ids = predictor->GetOutput(0); + auto seq_ids_shape = seq_ids->shape(); + auto seq_ids_size = std::accumulate(seq_ids_shape.begin(), + seq_ids_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_ids_size, ref_seq_ids_data.size()); + auto *seq_ids_data = seq_ids->data(); + for (size_t i = 0; i < seq_ids_size; i++) { + EXPECT_EQ(seq_ids_data[i], ref_seq_ids_data[i]); + } + // seq_scores + auto seq_scores = predictor->GetOutput(1); + auto seq_scores_shape = seq_scores->shape(); + auto seq_scores_size = std::accumulate(seq_scores_shape.begin(), + seq_scores_shape.end(), + 1, + std::multiplies()); + ASSERT_EQ(seq_scores_size, ref_seq_scores_data.size()); + auto *seq_scores_data = seq_scores->data(); + for (size_t i = 0; i < seq_scores_size; i++) { + EXPECT_NEAR(seq_scores_data[i], ref_seq_scores_data[i], 1e-5); + } +} + +TEST(TransformerWithMask, test_transformer_with_mask_fp32) { + // Save the optimized model by using full api with CxxConfig + lite_api::CxxConfig cxx_config; + cxx_config.set_model_dir(FLAGS_model_dir); + cxx_config.set_valid_places( + {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kInt64)}}); + auto predictor = lite_api::CreatePaddlePredictor(cxx_config); + predictor->SaveOptimizedModel(FLAGS_model_dir + ".nb", + paddle::lite_api::LiteModelType::kNaiveBuffer); + // Load the optimized model and run inference by using light api with + // MobileConfig + paddle::lite_api::MobileConfig mobile_config; + mobile_config.set_model_from_file(FLAGS_model_dir + ".nb"); + mobile_config.set_threads(1); + mobile_config.set_power_mode(paddle::lite_api::PowerMode::LITE_POWER_HIGH); + std::vector, + 
std::pair, std::vector>>> + test_cases = { + {{16, 16, 16, 1}, + {{0, 16, 16, 16, 16, 16, 16, 1, 0, 16, 16, 16, 16, 16, 9, 1}, + {0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -7.45856f, + -7.58065f, + 0.0f, + -0.939061f, + -1.91494f, + -2.94378f, + -4.26457f, + -5.82675f, + -8.70994f, + -8.8053f}}}, + {{16, 16, 16, 10, 1}, + {{0, 6, 53, 11, 1, 0, 6, 53, 56, 4, 1}, + {0.0f, + -2.36122f, + -4.1678f, + -6.19764f, + -7.69256f, + 0.0f, + -2.36122f, + -4.1678f, + -6.20145f, + -7.66355f, + -8.63024f}}}, + {{126, 4, 33, 1}, + {{0, 68, 5, 17, 1, 0, 68, 5, 13, 14, 1}, + {0.0f, + -0.829941f, + -1.20217f, + -2.23938f, + -2.98262f, + 0.0f, + -0.829941f, + -1.20217f, + -2.25051f, + -3.07555f, + -3.57711f}}}, + {{126, 4, 33, 99, 1}, + {{0, 14, 242, 17, 1, 0, 93, 38, 27, 68, 1}, + {0.f, + -1.8504f, + -2.66679f, + -3.09469f, + -3.63227f, + 0.0f, + -1.33829f, + -1.41656f, + -3.1333f, + -3.27901f, + -3.88582f}}}}; + for (auto &test_case : test_cases) { + PrepareInputData(predictor, test_case.first); + predictor->Run(); + CheckOutputData(predictor, test_case.second.first, test_case.second.second); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index f70d7d5381d82488eabbcd981c7ec72c49e93d05..36876bed4eaa14e26d79c0fc29ea5d9179c48e9c 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,80 +1,81 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_RKNPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_group_norm_compute SRCS group_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS 
elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_group_norm_compute SRCS group_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc 
DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_logical_compute SRCS logical_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} 
${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_kernel_fill_constant_batch_size_like_compute SRCS fill_constant_batch_size_like_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_conv_compute SRCS sequence_conv_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_sum_compute SRCS reduce_sum_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework 
${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} 
${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) - lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() endif() + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index a62c698f83fe10409af0bba8774135d3409358ea..0e803f1281fe2fc4dfca70c3f5223b8835ad7eff 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -302,6 +302,9 @@ TEST(Activation_relu, precision) { place = TARGET(kARM); 
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -324,6 +327,9 @@ TEST(Activation_leaky_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -404,6 +410,9 @@ TEST(Activation_sigmoid, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -428,6 +437,9 @@ TEST(Activation_tanh, precision) { place = TARGET(kARM); #elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif @@ -467,6 +479,9 @@ TEST(Activation_relu6, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif diff --git a/lite/tests/kernels/conv_compute_test.cc b/lite/tests/kernels/conv_compute_test.cc index 4442fe47e3a6410aa921d163ef0257602cce2fbc..a4bcf6ea70e3fe719793aa4ebd8fb8cd09e35905 100644 --- a/lite/tests/kernels/conv_compute_test.cc +++ b/lite/tests/kernels/conv_compute_test.cc @@ -413,6 +413,9 @@ TEST(Conv2d, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 5e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU) + place = TARGET(kHuaweiAscendNPU); + abs_error = 5e-2; // Using fp16 in NPU #else return; #endif diff --git a/lite/tests/kernels/prior_box_compute_test.cc b/lite/tests/kernels/prior_box_compute_test.cc index 73fd612c3a03c0a15ddaf3ce6c08ff0ed1a5a95b..ec0eda8cbb2b7f8d6ab01efa467ed857d817905a 100644 --- a/lite/tests/kernels/prior_box_compute_test.cc +++ b/lite/tests/kernels/prior_box_compute_test.cc @@ -21,7 +21,7 @@ namespace paddle { namespace lite { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void* fast_malloc(size_t size) { size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index adae19d013e50fbd484257a99f55229c75b94263..57899c8d1e2e0c073f410e90d18119327f21f066 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -120,6 +120,10 @@ bool test_gemm_int8(bool tra, auto dc_fp32 = tc_fp32.mutable_data(); auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); auto dbias = tbias.mutable_data(); if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index 99db53511446ecd4772fa2fd1b202337581506ef..3819c0dcd7f87c69a5805aae643a6a3a4a037f03 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -108,6 +108,10 @@ bool test_gemv_int8(bool tra, auto dc_basic_int8 = tc_basic_int8.mutable_data(); auto dc_basic_fp32 = tc_basic_fp32.mutable_data(); 
auto dbias = tbias.mutable_data(); + // set intial input to be 0 + memset(reinterpret_cast(dc_basic_fp32), + 0, + tc_basic_fp32.numel() * sizeof(float)); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc index b5beeaffaed6bff8a260c158bdce234fce6c1349..ecdf77fd37fff1da2914eeca5e29ef931de09c53 100644 --- a/lite/tests/math/sgemm_c4_compute_test.cc +++ b/lite/tests/math/sgemm_c4_compute_test.cc @@ -92,6 +92,7 @@ bool test_sgemm_c4( auto db_c4 = tb_c4.mutable_data(); auto dc_basic = tc_basic.mutable_data(); auto dbias = tbias.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); // trans A, B to c4 basic_trans_mat_to_c4(da, da_c4, k, m, k, true); diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc index 91a1fe1770dfa3eeb3f3b94fcd2361f1c1634b1e..661c4f02aa7eafe807f77767dfd4db01a338993e 100644 --- a/lite/tests/math/sgemv_compute_test.cc +++ b/lite/tests/math/sgemv_compute_test.cc @@ -84,6 +84,7 @@ bool test_sgemv(bool tra, auto db = tb.mutable_data(); auto dc = tc.mutable_data(); auto dc_basic = tc_basic.mutable_data(); + memset(reinterpret_cast(dc_basic), 0, tc_basic.numel()); auto dbias = tbias.mutable_data(); paddle::lite_api::ActivationType act = paddle::lite_api::ActivationType::kIndentity; diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 9365120772d96d31ff0af98c2cab4dea609be5ab..f3f9b9a94236b0d4f25448deb6a702b82c38740f 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -22,6 +22,7 @@ OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF WITH_LOG=ON +WITH_EXCEPTION=OFF WITH_PROFILE=OFF BUILD_NPU=OFF NPU_DDK_ROOT="$(pwd)/ai_ddk_lib/" # Download HiAI DDK from https://developer.huawei.com/consumer/cn/hiai/ @@ -32,6 +33,9 @@ BUILD_APU=OFF APU_DDK_ROOT="$(pwd)/apu_sdk_lib/" BUILD_RKNPU=OFF RKNPU_DDK_ROOT="$(pwd)/rknpu/" +WITH_HUAWEI_ASCEND_NPU=OFF # Huawei Ascend Builder/Runtime Libs on X86 host +# default installation path, ensure acllib/atc/opp directories are all in this root dir +HUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" PYTHON_EXECUTABLE_OPTION="" readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -39,8 +43,8 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$PWD # if operating in mac env, we should expand the maximum file num -os_nmae=`uname -s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -126,6 +130,7 @@ function make_tiny_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -181,6 +186,7 @@ function make_opencl { -DWITH_TESTING=OFF \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_CV=$BUILD_CV \ -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 @@ -219,6 +225,7 @@ function make_full_publish_so { -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -343,6 +350,8 @@ function make_cuda { 
-DLITE_WITH_STATIC_CUDA=OFF \ -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT @@ -358,6 +367,11 @@ function make_x86 { root_dir=$(pwd) build_directory=$BUILD_DIR/build.lite.x86 + if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then + export CXX=/usr/bin/g++ # Ascend need g++ in centos + build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu + fi + if [ -d $build_directory ] then rm -rf $build_directory @@ -379,10 +393,13 @@ function make_x86 { -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_LOG=${WITH_LOG} \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_WITH_PROFILE=${WITH_PROFILE} \ -DLITE_WITH_XPU=$BUILD_XPU \ -DLITE_WITH_XTCL=$BUILD_XTCL \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=$WITH_HUAWEI_ASCEND_NPU \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT=$HUAWEI_ASCEND_NPU_DDK_ROOT \ -DCMAKE_BUILD_TYPE=Release \ -DPY_VERSION=$PY_VERSION \ $PYTHON_EXECUTABLE_OPTION @@ -409,6 +426,7 @@ function print_usage { echo echo -e "optional argument:" echo -e "--with_log: (OFF|ON); controls whether to print log information, default is ON" + echo -e "--with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_train: (OFF|ON); controls whether to publish training operators and kernels, build_train is only for full_publish library now" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" @@ -491,6 +509,17 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARM_OS=="android" && $ARM_ABI == "armv7" && $ARM_LANG != "clang" ]]; then + set +x + echo + echo -e "error: only clang provide C++ exception handling support for 32-bit ARM." + echo + exit 1 + fi + shift + ;; --with_profile=*) WITH_PROFILE="${i#*=}" shift @@ -539,6 +568,14 @@ function main { RKNPU_DDK_ROOT="${i#*=}" shift ;; + --with_huawei_ascend_npu=*) + WITH_HUAWEI_ASCEND_NPU="${i#*=}" + shift + ;; + --huawei_ascend_npu_ddk_root=*) + HUAWEI_ASCEND_NPU_DDK_ROOT="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_android.sh b/lite/tools/build_android.sh index 5713c4e21bb97d12bb840c99d1adbc7f2d781157..ecf34f0dfc4ddd141af9ea07dd6c4f15d1c0c16b 100755 --- a/lite/tools/build_android.sh +++ b/lite/tools/build_android.sh @@ -17,6 +17,8 @@ WITH_JAVA=ON WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. 
OPTMODEL_DIR="" WITH_STRIP=OFF @@ -145,6 +147,7 @@ function make_tiny_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -194,6 +197,7 @@ function make_full_publish_so { local cmake_mutable_options=" -DLITE_BUILD_EXTRA=$WITH_EXTRA \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_JAVA=$WITH_JAVA \ @@ -237,6 +241,7 @@ function print_usage { echo -e "| --with_java: (OFF|ON); controls whether to publish java api lib, default is ON |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -320,6 +325,18 @@ function main { WITH_LOG="${i#*=}" shift ;; + # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + if [[ $WITH_EXCEPTION == "ON" && $ARCH == "armv7" && $TOOLCHAIN != "clang" ]]; then + set +x + echo + echo -e "Error: only clang provide C++ exception handling support for 32-bit ARM." + echo + exit 1 + fi + shift + ;; # compiling lib which can operate on opencl and cpu. --with_opencl=*) WITH_OPENCL="${i#*=}" diff --git a/lite/tools/build_ios.sh b/lite/tools/build_ios.sh index 3d4337aa8ecc20fd078b8906a950408927ea56c8..4eea073a058ba9e1e821e9f0746687baa0c38d5f 100755 --- a/lite/tools/build_ios.sh +++ b/lite/tools/build_ios.sh @@ -12,6 +12,8 @@ WITH_EXTRA=OFF WITH_CV=OFF # controls whether to hide log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # absolute path of Paddle-Lite. workspace=$PWD/$(dirname $0)/../../ # options of striping lib according to input model. 
@@ -69,6 +71,7 @@ function make_ios { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DLITE_WITH_X86=OFF \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DARM_TARGET_ARCH_ABI=$arch \ @@ -96,6 +99,7 @@ function print_usage { echo -e "| --arch: (armv8|armv7), default is armv8 |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| --with_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP) |" echo -e "| |" echo -e "| arguments of striping lib according to input model:(armv8, gcc, c++_static) |" @@ -140,6 +144,10 @@ function main { WITH_LOG="${i#*=}" shift ;; + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; help) print_usage exit 0 diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index 5ed491cb7da7b33357b7e66ab8267e60815b5348..f6de128feb6073fe206d03b68c5d8bc04dc9f16c 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -17,6 +17,8 @@ PY_VERSION="" WITH_CV=OFF # controls whether to print log information, default is ON. WITH_LOG=ON +# controls whether to throw the exception when error occurs, default is OFF +WITH_EXCEPTION=OFF # options of striping lib according to input model. WITH_STRIP=OFF OPTMODEL_DIR="" @@ -60,6 +62,7 @@ function init_cmake_mutable_options { -DPY_VERSION=$PY_VERSION \ -DLITE_WITH_CV=$WITH_CV \ -DLITE_WITH_LOG=$WITH_LOG \ + -DLITE_WITH_EXCEPTION=$WITH_EXCEPTION \ -DLITE_BUILD_TAILOR=$WITH_STRIP \ -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_OPENCL=$WITH_OPENCL \ @@ -210,6 +213,7 @@ function print_usage { echo -e "| --python_version: (2.7|3.5|3.7); controls python version to compile whl, default is None |" echo -e "| --with_cv: (OFF|ON); controls whether to compile cv functions into lib, default is OFF |" echo -e "| --with_log: (OFF|ON); controls whether to print log information, default is ON |" + echo -e "| --with_exception: (OFF|ON); controls whether to throw the exception when error occurs, default is OFF |" echo -e "| |" echo -e "| arguments of striping lib according to input model: |" echo -e "| ./lite/tools/build_linux.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir |" @@ -280,6 +284,11 @@ function main { shift ;; # ON or OFF, default OFF + --with_exception=*) + WITH_EXCEPTION="${i#*=}" + shift + ;; + # ON or OFF, default OFF --with_strip=*) BUILD_TAILOR="${i#*=}" shift diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 680c865c2c8999a29ff2b351dadfc797506c87f6..9cec7cdc5d566d1db5a8de4c723a9e0b11408d4d 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -21,8 +21,8 @@ USE_ADB_EMULATOR=ON LITE_WITH_COVERAGE=OFF # if operating in mac env, we should expand the maximum file num -os_nmae=`uname -s` -if [ ${os_nmae} == "Darwin" ]; then +os_name=`uname -s` +if [ ${os_name} == "Darwin" ]; then ulimit -n 1024 fi @@ -399,6 +399,64 @@ function build_test_xpu { test_xpu } +function cmake_huawei_ascend_npu { + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. 
\ + ${common_flags} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_HUAWEI_ASCEND_NPU=ON \ + -DHUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" \ + -DCMAKE_BUILD_TYPE=Release +} + +function build_huawei_ascend_npu { + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE +} + +# It will eagerly test all lite related unittests. +function test_huawei_ascend_npu { + # Due to the missing of ascend kernels, we skip the following tests temporarily. + # TODO(xxx) clear the skip list latter + local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet" + "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" + "test_inceptionv4_lite_x86" "test_light_api" + "test_apis" "test_model_bin" + ) + local to_skip=0 + for _test in $(cat $TESTS_FILE); do + to_skip=0 + for skip_name in ${skip_list[@]}; do + if [ $skip_name = $_test ]; then + echo "to skip " $skip_name + to_skip=1 + fi + done + + if [ $to_skip -eq 0 ]; then + ctest -R $_test -V + fi + done +} + +# Build the code and run lite server tests. This is executed in the CI system. +function build_test_huawei_ascend_npu { + cur_dir=$(pwd) + + build_dir=$cur_dir/build.lite.huawei_ascend_npu_test + mkdir -p $build_dir + cd $build_dir + + cmake_huawei_ascend_npu + build_huawei_ascend_npu + + test_huawei_ascend_npu +} + # test_arm_android function test_arm_android { local test_name=$1 @@ -415,7 +473,7 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl") + skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl" "test_transformer_with_mask_fp32_arm") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done @@ -1157,6 +1215,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + test_huawei_ascend_npu) + test_huawei_ascend_npu + shift + ;; build_test_cuda_server) build_test_cuda_server shift @@ -1174,6 +1236,10 @@ function main { build_test_xpu shift ;; + build_test_huawei_ascend_npu) + build_test_huawei_ascend_npu + shift + ;; build_test_train) build_test_train shift @@ -1199,6 +1265,7 @@ function main { build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu build_test_arm_subtask_model test_resnet50 resnet50 build_test_arm_subtask_model test_inceptionv4 inception_v4_simple + build_test_arm_subtask_model test_transformer_with_mask_fp32_arm transformer_with_mask_fp32 shift ;; build_test_arm_subtask_armlinux) diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index abb60f6141fbee53916a7db1711cf606afb09924..0cf14d12d553a4d9f7f4ed9780e4274560a8b23f 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -56,8 +56,8 @@ const std::vector> supported_ops_target = { ops_lines = [] # valid targets and valid_ops 
-valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[]] +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]] class TargetType: kUnk = 0 kHost = 1 @@ -73,6 +73,7 @@ class TargetType: kMLU = 11 kRKNPU = 12 kAPU = 13 + kHuaweiAscendNPU = 14 # record op_info of valid kernels into `valid_ops` according to different target type diff --git a/lite/utils/env.h b/lite/utils/env.h index f3bb8b58e1b63ed2c0ed05792020d11ea307690c..1d26148cea1ed499c8d5ca408ae9235788be6e91 100644 --- a/lite/utils/env.h +++ b/lite/utils/env.h @@ -15,14 +15,23 @@ #pragma once #include #include - #include #include +// Specify the path of configuration file for the subgraph segmentation, an +// example is shown as below: +// op_type:in_var_name_0,in_var_name1:out_var_name_0,out_var_name1 +// op_type::out_var_name_0 +// op_type:in_var_name_0 +// op_type #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \ "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE" -#define SUBGRAPH_DISABLE_ONLINE_MODE "SUBGRAPH_DISABLE_ONLINE_MODE" +// The original weight/local/unused variables in the subblock of the subgraph op +// will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to true(default) during +// the analysis phase, it ensure the ops in the subblock can be converted to the +// target device model online during the execution phase. +#define SUBGRAPH_ONLINE_MODE "SUBGRAPH_ONLINE_MODE" namespace paddle { namespace lite { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index f292f220c006135af664ea34acc03525a5c112ab..c7fa8d4cf113abebb29c4ebe972e243a39573cf0 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -57,7 +57,7 @@ static int gettimeofday(struct timeval* tp, void* tzp) { #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" -#ifdef LITE_WITH_ANDROID +#if defined(LITE_WITH_LOG) && defined(LITE_WITH_ANDROID) #include // Android log macors #define ANDROID_LOG_TAG "Paddle-Lite" @@ -143,8 +143,10 @@ class LogMessage { ANDROID_LOG_I(log_stream_.str().c_str()); } else if (level_ == "W") { ANDROID_LOG_W(log_stream_.str().c_str()); + } else if (level_ == "F") { + ANDROID_LOG_F(log_stream_.str().c_str()); } else { - fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + fprintf(stderr, "Unsupported log level: %s\n", level_.c_str()); assert(false); } #endif @@ -170,17 +172,25 @@ class LogMessageFatal : public LogMessage { const char* level = "F") : LogMessage(file, func, lineno, level) {} - ~LogMessageFatal() { + ~LogMessageFatal() +#ifdef LITE_WITH_EXCEPTION + noexcept(false) +#endif + { log_stream_ << '\n'; #ifdef LITE_WITH_ANDROID ANDROID_LOG_F(log_stream_.str().c_str()); #endif fprintf(stderr, "%s", log_stream_.str().c_str()); +#ifdef LITE_WITH_EXCEPTION + throw std::exception(); +#else #ifndef LITE_ON_TINY_PUBLISH abort(); #else assert(false); +#endif #endif } }; @@ -237,7 +247,11 @@ class Voidify { class VoidifyFatal : public Voidify { public: +#ifdef LITE_WITH_EXCEPTION + ~VoidifyFatal() noexcept(false) { throw std::exception(); } +#else ~VoidifyFatal() { assert(false); } +#endif }; #endif diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index 081006be6711d5d26c405181fd6d86e89c9e4e95..8e14e4d6d5dbab8dc01b9f8a07910a905cae6abf 100644 --- a/lite/utils/replace_stl/stream.cc +++ 
b/lite/utils/replace_stl/stream.cc @@ -23,6 +23,14 @@ namespace paddle { namespace lite { namespace replace_stl { +#ifndef LITE_WITH_LOG +#define ADD_DATA_AS_STRING(data_, obj_) +#else +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = std::to_string(obj_); \ + pad(text); \ + data_ = data_ + text; + void ostream::pad(const std::string& text) { if (display_width_ > 0) { if (display_width_ < text.size()) { @@ -36,15 +44,6 @@ void ostream::pad(const std::string& text) { } } } - -#ifndef LITE_WITH_LOG -#define ADD_DATA_AS_STRING(data_, obj_) -#else -#define ADD_DATA_AS_STRING(data_, obj_) \ - std::string text = std::to_string(obj_); \ - pad(text); \ - data_ = data_ + text; - #endif template <> diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h index 3288a1986906b3fd600b91b6a56ae7134644456f..c58265a0cd864ebe2d2d158d953b17e2c230531f 100644 --- a/lite/utils/replace_stl/stream.h +++ b/lite/utils/replace_stl/stream.h @@ -57,7 +57,9 @@ class ostream { ostream& operator<<(const T* obj); private: +#ifdef LITE_WITH_LOG void pad(const std::string& text); +#endif std::string data_; int display_width_{-1}; // -1 refers to no setting };
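Note on the new LITE_WITH_EXCEPTION behavior introduced in lite/utils/logging.h above: when the option is defined, LogMessageFatal and VoidifyFatal throw std::exception() from a noexcept(false) destructor instead of calling abort()/assert(false), so a host application can recover from fatal checks. The following is a minimal, self-contained sketch of that pattern only; FatalMessage is a hypothetical stand-in used for illustration, not the library's actual class, and the locally defined LITE_WITH_EXCEPTION macro here just mimics building with that option enabled.

```cpp
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <sstream>

// Assume the library was configured with -DLITE_WITH_EXCEPTION=ON.
#define LITE_WITH_EXCEPTION

class FatalMessage {
 public:
  std::ostringstream& stream() { return stream_; }

  // The destructor must be declared noexcept(false) to legally throw from it:
  // C++11 destructors are noexcept(true) by default.
  ~FatalMessage()
#ifdef LITE_WITH_EXCEPTION
      noexcept(false)
#endif
  {
    std::fprintf(stderr, "%s\n", stream_.str().c_str());
#ifdef LITE_WITH_EXCEPTION
    // With exceptions enabled, hand control back to the caller.
    throw std::exception();
#else
    // Default behavior without the option: terminate the process.
    std::abort();
#endif
  }

 private:
  std::ostringstream stream_;
};

int main() {
  try {
    // The temporary is destroyed at the end of the full expression,
    // which prints the message and then throws.
    FatalMessage().stream() << "shape mismatch in fake op";
  } catch (const std::exception&) {
    std::fprintf(stderr, "recovered from a fatal error instead of aborting\n");
  }
  return 0;
}
```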