Unverified commit fdf44c59 authored by Leonardo-Ding and committed by GitHub

Merge branch 'develop' into dwh_dev

......@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing exceptions when an error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -224,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
......@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19,
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.3 dependencies of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later version
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -94,12 +94,10 @@ function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT ${GEN_HEADER}
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}"
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
--force-empty --force-empty-vectors
${OPT}
-I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test"
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
......@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
......@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -537,6 +557,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# Set the build options and start compiling
# android-armv7:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv7 \
--toolchain=clang \
--with_cv=OFF \
--with_log=OFF \
--with_extra=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# android-armv8:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv8 \
--toolchain=clang \
--with_log=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# Note: for build help, run: ./lite/tools/build_android.sh help
```
......@@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \
## 3. How to use it in code
That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository;
That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository, which also shows how to check whether the current device supports OpenCL;
Note: this link points to the latest code on the online develop branch, which may well differ from your local code; it is recommended to consult your own local code under `lite/demo/cxx/` to see how it is used.
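For reference, a minimal hedged sketch (not taken from the demo itself) of how the OpenCL availability check introduced by this change can be used through the light API; the model file name is a placeholder:

```cpp
#include <iostream>
#include "paddle_api.h"  // NOLINT

int main() {
  paddle::lite_api::MobileConfig config;
  // Placeholder path to an optimized .nb model produced by the opt tool.
  config.set_model_from_file("mobilenet_v1_opt.nb");

  // Returns true only if the library was built with LITE_WITH_OPENCL and the
  // current device exposes a usable OpenCL driver; otherwise the CPU kernels
  // are used.
  if (!paddle::lite_api::IsOpenCLBackendValid()) {
    std::cout << "OpenCL backend unavailable, running on ARM CPU kernels."
              << std::endl;
  }

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
          config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;
  predictor->Run();
  return 0;
}
```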
......
......@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -45,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
......@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if(WIN32)
target_link_libraries(paddle_light_api_shared shlwapi.lib)
endif()
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
......@@ -94,6 +95,7 @@ if (WITH_TESTING)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
APU_DEPS ${apu_kernels})
endif()
......@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
......@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
......@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -191,6 +200,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -415,7 +427,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -430,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -445,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -459,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -470,6 +486,7 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -487,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
......@@ -58,18 +57,22 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = node.op()->op_info();
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source file name from op type and kernel type
auto op2pathmap = OpKernelInfoCollector::Global().GetOp2PathDict();
......@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;
core::KernelPickFactor factor;
......
......@@ -49,18 +49,33 @@ class LITE_API Predictor {
program_desc_ = std::make_shared<cpp::ProgramDesc>();
}
// Create a predictor with the weight variable scope set.
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. Create a predictor with the
// weight variable scope set given.
///////////////////////////////////////////////////////////////////
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. This constructor can only
// be called from Predictor->Clone. It creates a predictor from
// an existing ProgramDesc, Scope and RuntimeProgram.
///////////////////////////////////////////////////////////////////
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
: program_desc_(program_desc), scope_(root) {
// step1. Create a Program to construct the exec_scope and ops
Program program(program_desc_, scope_, valid_places, var_names);
exec_scope_ = program.exec_scope();
valid_places_ = valid_places;
// step 2. Create the RuntimeProgram.
program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx));
program_generated_ = true;
}
// Build from a model, with places set for hardware config.
......@@ -79,32 +94,62 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const {
//////////////////////////////////////////////////////////
// Function: Clone
// Usage: Create a Predictor from an existing one; the cloned
// predictor will share persistable variables in scope_ with
// the original predictor.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone() {
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
// step3. Return the result
return predictor;
}
std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
//////////////////////////////////////////////////////////
// Function: Clone(var_names)
// Usage: Create a Predictor from an existing one; the cloned
// predictor shares persistable variables with the original,
// except those named in var_names, which are not shared.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone(const std::vector<std::string>& var_names) {
CHECK(program_desc_) << "Both the program and scope of the current "
"predictor should not be nullptr in Clone mode.";
CHECK(scope_) << "Both the program and scope of the current predictor "
"should not be nullptr in Clone mode.";
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
// step3. Copy some persistable variables into private scope.
for (auto var_name : var_names) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor =
predictor->scope_->Var(var_name)->GetMutable<lite::Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
// step4. Return the result
return predictor;
}
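The two Clone overloads above can be summarized with a small hedged sketch (assuming the internal lite/api/cxx_api.h header; the variable name is illustrative):

```cpp
#include "lite/api/cxx_api.h"

namespace paddle {
namespace lite {

// Sketch: derive worker predictors from an already-built `base` predictor.
// Clone() shares every persistable variable with `base`; Clone(var_names)
// additionally gives the clone private copies of the named variables so the
// two predictors can modify them independently.
void MakeWorkers(const std::shared_ptr<Predictor>& base) {
  auto shares_all = base->Clone();
  auto owns_fc_weight = base->Clone({"fc_0.w_0"});  // illustrative var name
  (void)shares_all;
  (void)owns_fc_weight;
}

}  // namespace lite
}  // namespace paddle
```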
......@@ -140,6 +185,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......@@ -162,7 +208,7 @@ class LITE_API Predictor {
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_;
std::shared_ptr<RuntimeProgram> program_;
bool program_generated_{false};
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
......
......@@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
......
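For context, a hedged sketch (not part of this patch) of how the new Huawei Ascend NPU plumbing might be driven from user code via the C++ full API; the model directory, cache path and the x86 fallback place are illustrative assumptions, and `set_subgraph_model_cache_dir` is the pre-existing ConfigBase setter that matches the getter shown later in this diff:

```cpp
#include "paddle_api.h"  // NOLINT

std::shared_ptr<paddle::lite_api::PaddlePredictor> CreateAscendPredictor() {
  paddle::lite_api::CxxConfig config;
  // Placeholder model directory containing __model__ and the params file.
  config.set_model_dir("./mobilenet_v1");
  // Prefer Huawei Ascend NPU subgraphs, with an illustrative host fallback.
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  // New in this patch: choose which Ascend device the subgraphs run on.
  config.set_device_id(0);
  // Cache compiled subgraph models so later runs skip subgraph building.
  config.set_subgraph_model_cache_dir("/tmp/ascend_subgraph_cache");
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
      config);
}
```

The `set_device_id` value is what `Init` above consumes through `config.get_device_id()`.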
......@@ -22,17 +22,18 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For post-training weight quantization, load the int8/16 weights
// of the optimized model and dequantize them to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
program_desc_.reset();
}
void LightPredictor::Build(const std::string& model_dir,
......@@ -43,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -60,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -109,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -132,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
// Only extract the ops and generate the runtime program from the main
// block desc.
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -205,10 +189,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
......
......@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.25 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......
......@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
......@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
// Get the lengh of the first column: maximum length of the op_type
......
......@@ -32,9 +32,22 @@
#include "lite/backends/mlu/target_wrapper.h"
#endif
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_runtime.h"
#endif
namespace paddle {
namespace lite_api {
bool IsOpenCLBackendValid() {
bool opencl_valid = false;
#ifdef LITE_WITH_OPENCL
opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice();
#endif
LOG(INFO) << "opencl_valid:" << opencl_valid;
return opencl_valid;
}
Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
// TODO(Superjomn) refine this by using another `const void* const_raw`;
......
......@@ -33,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
struct LITE_API Tensor {
explicit Tensor(void* raw);
explicit Tensor(const void* raw);
......@@ -123,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
......@@ -142,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
// set Device ID
void set_device_id(int device_id) { device_id_ = device_id; }
const int get_device_id() const { return device_id_; }
};
/// CxxConfig is the config for the Full feature predictor.
......
......@@ -75,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
"bm",
"mlu",
"rknpu",
"apu"};
"apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -118,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM",
"kMLU",
"kRKNPU",
"kAPU"};
"kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -163,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11,
kRKNPU = 12,
kAPU = 13,
NUM = 14, // number of fields.
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -28,6 +28,7 @@ USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_conv_conv_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
......@@ -47,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
......
......@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din,
}
return true;
}
// for example: m = 4, n = 4
// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9 , 10 ,11], [12, 13, 14, 15]]
// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
/*
m = 8, n = 8:
din  =  0  1  2  3  4  5  6  7      dout =  0  8 16 24 32 40 48 56
        8  9 10 11 12 13 14 15              1  9 17 25 33 41 49 57
       16 17 18 19 20 21 22 23              2 10 18 26 34 42 50 58
       24 25 26 27 28 29 30 31              3 11 19 27 35 43 51 59
       32 33 34 35 36 37 38 39              4 12 20 28 36 44 52 60
       ...                                  ...
*/
template <typename Dtype>
void local_transpose(const Dtype* din, Dtype* dout, int m, int n) {
// n % 4 == 0 && m % 4 == 0
// n * m ==> n * m data trans
int offset_m = m << 2;
const Dtype* din_ptr = din;
Dtype* dout_ptr = dout;
for (int i = 0; i < n; i += 4) {
Dtype* out_ptr0 = dout_ptr;
Dtype* out_ptr1 = dout_ptr + m;
Dtype* out_ptr2 = out_ptr1 + m;
Dtype* out_ptr3 = out_ptr2 + m;
const Dtype* in_ptr0 = din_ptr;
const Dtype* in_ptr1 = din_ptr + m;
const Dtype* in_ptr2 = in_ptr1 + m;
const Dtype* in_ptr3 = in_ptr2 + m;
for (int j = 0; j < m; j += 4) {
float32x4_t vin0 = vld1q_f32(in_ptr0);
float32x4_t vin1 = vld1q_f32(in_ptr1);
float32x4_t vin2 = vld1q_f32(in_ptr2);
float32x4_t vin3 = vld1q_f32(in_ptr3);
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3);
in_ptr0 = in_ptr3 + m;
in_ptr1 = in_ptr3 + 2 * m;
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
in_ptr2 = in_ptr1 + m;
in_ptr3 = in_ptr1 + 2 * m;
vst1q_f32(out_ptr0, tmp0.val[0]);
vst1q_f32(out_ptr1, tmp0.val[1]);
out_ptr0 += 4;
out_ptr1 += 4;
vst1q_f32(out_ptr2, tmp2.val[0]);
vst1q_f32(out_ptr3, tmp2.val[1]);
out_ptr2 += 4;
out_ptr3 += 4;
}
dout_ptr += offset_m;
din_ptr += 4;
}
}
template <typename Dtype>
void transpose(const Dtype* din, Dtype* dout, int m, int n) {
// nxm == mxn
// 4x4
int cnt_n = n >> 2;
int remain_n = n & 3;
int cnt_m = m >> 2;
int remain_m = m & 3;
int nn_num = n << 2; // n * 4
int mm_num = m << 2; // m * 4
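  // Main body: each step transposes a 4x4 tile of din with NEON; leftover
  // elements in either dimension (remain_n, remain_m) are handled by the
  // scalar tails below.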
for (int x = 0; x < cnt_n; x++) {
const Dtype* din_ptr0 = din + x * mm_num;
const Dtype* din_ptr1 = din_ptr0 + m;
const Dtype* din_ptr2 = din_ptr1 + m;
const Dtype* din_ptr3 = din_ptr2 + m;
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(din0, din1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(din2, din3);
din_ptr0 += 4;
din_ptr1 += 4;
// a00 b00 c00 d00 a02 b02 c02 d02
// a01 b01 c01 d01 a03 b03 c03 d03
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
din_ptr2 += 4;
din_ptr3 += 4;
vst1q_f32(dout_ptr0, tmp0.val[0]);
vst1q_f32(dout_ptr1, tmp0.val[1]);
dout_ptr0 += nn_num;
vst1q_f32(dout_ptr2, tmp2.val[0]);
vst1q_f32(dout_ptr3, tmp2.val[1]);
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
*dout_ptr0++ = *din_ptr1++;
*dout_ptr0++ = *din_ptr2++;
*dout_ptr0++ = *din_ptr3++;
}
}
const Dtype* din_ptr0 = din + cnt_n * mm_num;
dout = dout + cnt_n * 4;
for (int x = 0; x < remain_n; x++) {
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
din_ptr0 += 4;
*dout_ptr0 = din0[0];
*dout_ptr1 = din0[1];
dout_ptr0 += nn_num;
*dout_ptr2 = din0[2];
*dout_ptr3 = din0[3];
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
}
}
}
/* preprocessing inputs
 * input din: [1, chin, he - hs, we - ws] --> output dout: [n, chin, 1, we - ws]
 * n = he - hs
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
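  // int64 path: plain scalar loop, no NEON specialization.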
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
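  // Broadcast semantics: diny holds one value per channel, which is multiplied
  // into the `num` contiguous elements of each (batch, channel) slice of dinx.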
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din,
} else {
if (pad_bottom > 1) {
coef_h = 1.f / 3;
} else if (pad_bottom = 1) {
} else if (pad_bottom == 1) {
coef_h = 0.5f;
} else {
coef_h = 1.f;
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace arm {
namespace math {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
......@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din,
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
height = height - 1;
      int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
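      // Sum over the remaining rows: 4 output lanes per step, accumulating 4
      // rows per inner iteration; `stride` (4 * width) advances the four row
      // pointers to the next group of rows.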
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
}
      for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din,
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
      height = height - 1;
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
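      // Max-pool counterpart of the sum path above: the same 4-lane / 4-row
      // unrolling, with vmaxq_f32 in place of the additions.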
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
}
      for (int h = 0; h < remain_h; h++) {
        float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
      for (int h = 0; h < cnt_h; h++) {
        *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
      for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......
......@@ -11,10 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,10 +28,13 @@ set (
cuda_transpose
cuda_elementwise
cudnn_pool
cuda_gru_forward
cuda_sequence2batch
cuda_gemm
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
cuda_bias
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/backends/cuda/math/utils.h"
......@@ -21,6 +22,20 @@ namespace lite {
namespace cuda {
namespace math {
ActivationType GetActiveType(const std::string& act) {
if (act == "sigmoid") {
return kSigmoid;
} else if (act == "relu") {
return kReLU;
} else if (act == "tanh") {
return kTanh;
} else if (act == "identify") {
return kIdentity;
} else {
LOG(FATAL) << "not supported activation: " << act;
}
}
template <typename T>
__global__ void relu_kernel(const int num,
const float alpha,
......@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t);
template void bias_relu(
int, const float*, const float* bias, float*, float, cudaStream_t);
// ------------- sigmoid -------------
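// fp32 kernel plus half and half2 specializations; the half2 variant packs two
// fp16 values per element, so the launcher below halves the element count when
// `num` is even.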
template <typename T>
__global__ void sigmoid_kernel(const int num, const T* in, T* out) {
CUDA_KERNEL_LOOP(i, num) {
#if __CUDA_ARCH__ >= 350
out[i] = static_cast<T>(1.0f) /
(static_cast<T>(1.0f) + expf(-1 * __ldg(in + i)));
#else
out[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + expf(-in[i]));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half* in, half* out) {
CUDA_KERNEL_LOOP(i, num) {
half tmp = __float2half(1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __hdiv(
tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i)))));
#else
out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i]))));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) {
CUDA_KERNEL_LOOP(i, num) {
half2 tmp = __floats2half2_rn(1.0f, 1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __h2div(tmp,
__hadd2(tmp,
h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f),
__ldg(in + i)))));
#else
out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x))));
out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y))));
#endif
}
}
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) {
sigmoid_kernel<T><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
CUDA_POST_KERNEL_CHECK;
}
template <>
void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) {
if (num % 2 == 0) {
const half2* din2 = reinterpret_cast<const half2*>(din);
half2* dout2 = reinterpret_cast<half2*>(dout);
sigmoid_kernel<
half2><<<CUDA_GET_BLOCKS(num / 2), CUDA_NUM_THREADS, 0, stream>>>(
num / 2, din2, dout2);
} else {
sigmoid_kernel<half><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
}
CUDA_POST_KERNEL_CHECK;
}
template void sigmoid(const int num,
const float* din,
float* dout,
cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
......@@ -17,11 +17,22 @@
#include <cuda_runtime.h>
#include <string>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
ActivationType GetActiveType(const std::string& act);
// fp32 and half
template <typename T>
void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream);
......@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num,
const void* scale,
cudaStream_t stream);
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/bias.h"
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void RowwiseAddKernel(
const T* a, const T* b, T* c, int width, int num) {
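  // One thread per output element: `w` indexes the bias vector of length
  // `width`, which is added to every row of the flattened matrix.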
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <>
__global__ void RowwiseAddKernel(
const half* a, const half* b, half* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = __hadd(a[i], b[w]);
}
}
template <typename T>
void RowwiseAdd<T>::operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream) {
RowwiseAddKernel<T><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
input, bias, output, width, count);
CUDA_POST_KERNEL_CHECK;
}
template struct RowwiseAdd<float>;
template struct RowwiseAdd<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
struct RowwiseAdd {
void operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream);
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/math/gru_forward.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
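// gate_value layout per batch row: [update | reset | candidate], each of
// length frame_size. This kernel activates the update and reset gates and
// writes reset_output = prev_output * reset_gate.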
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
T prev_out = 0;
T reset_out_val;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == lite::cuda::math::ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
reset_out_val = prev_out * reset_gate_value;
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
template <>
__global__ void GruForwardResetOutput(
half* gate_value,
half* reset_output_value,
half* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
half prev_out = 0;
half reset_out_val;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
#if __CUDA_ARCH__ >= 530
reset_out_val = __hmul(prev_out, reset_gate_value);
#else
reset_out_val =
__float2half(__half2float(prev_out) * __half2float(reset_gate_value));
#endif
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
T output;
T prev_out = 0;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
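  // origin_mode: h = u * h_prev + (1 - u) * c; otherwise: h = (1 - u) * h_prev + u * c,
  // where u is the update gate and c the activated candidate state.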
if (origin_mode) {
output = update_gate_value * prev_out + state_frame_value -
update_gate_value * state_frame_value;
} else {
output = prev_out - update_gate_value * prev_out +
update_gate_value * state_frame_value;
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template <>
__global__ void GruForwardFinalOutput(
half* gate_value,
half* prev_output_value,
half* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
half output;
half prev_out = 0;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
#if __CUDA_ARCH__ >= 530
output =
__hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(state_frame_value) -
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
} else {
#if __CUDA_ARCH__ >= 530
    output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)),
                    __hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(prev_out) -
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template __global__ void GruForwardFinalOutput<float>(
float* gate_value,
float* prev_output_value,
float* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
template __global__ void GruForwardResetOutput<float>(
float* gate_value,
float* reset_output_value,
float* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename Dtype>
inline __device__ Dtype Sigmoid(const Dtype a) {
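  // Clamp the input to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] so that
  // expf cannot overflow before evaluating 1 / (1 + exp(-x)).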
const Dtype min = SIGMOID_THRESHOLD_MIN;
const Dtype max = SIGMOID_THRESHOLD_MAX;
Dtype tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-tmp));
}
template <>
inline __device__ half Sigmoid(const half a) {
#if __CUDA_ARCH__ >= 530
const half tmp = __float2half(1.0f);
return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a))));
#else
return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f));
#endif
}
template <typename Dtype>
inline __device__ Dtype ReLU(const Dtype a) {
return a > static_cast<Dtype>(0.f) ? a : static_cast<Dtype>(0.f);
}
template <>
inline __device__ half ReLU(const half a) {
const half tmp = __float2half(0.f);
#if __CUDA_ARCH__ >= 530
return __hgt(a, tmp) ? a : tmp;
#else
return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f);
#endif
}
template <typename Dtype>
inline __device__ Dtype Tanh(const Dtype a) {
Dtype tmp = static_cast<Dtype>(-2.0) * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
static_cast<Dtype>(1.0);
}
template <>
inline __device__ half Tanh(const half a) {
#if __CUDA_ARCH__ >= 530
half tmp = __float2half(1.0f);
half numerator = __hmul(__float2half(-2.0f), a);
return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp);
#else
float tmp = -2.0f * __half2float(a);
return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f);
#endif
}
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
/*
* threads(tile_size, 1)
* grids(frame_blocks, 1)
*/
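// Computes gate = activation(x + h_prev * gate_weight) for the update and
// reset gates (2 * frame_size columns), accumulating the matrix product with
// TiledSize-wide warp shuffles, and writes reset_output = reset_gate * h_prev.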
template <class T, int TiledSize>
__global__ void FastCollectiveGruGate(T* gate_value,
T* prev_output_value,
T* gate_weight,
T* reset_output,
int frame_size,
ActivationType active_node) {
T xt_0 = 0.0f;
T a0 = 0.0f;
T c0 = 0.0f;
T b0[TiledSize];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int tiled_mask = ((1 << TiledSize) - 1);
// tiled matrix multiply using register shift, faster than sm.
if (prev_output_value) {
for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = prev_output_value[threadIdx.x + (k * TiledSize)];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size * 2) {
xt_0 = gate_value[col];
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col] = c0;
if (frame_size <= col && col < frame_size * 2) {
T htp_0 = 0.0;
if (prev_output_value) {
htp_0 = prev_output_value[col - frame_size];
}
reset_output[col - frame_size] = c0 * htp_0;
} else if (col < frame_size) {
gate_value[col] = c0;
}
}
}
template <class T, int TiledSize>
__global__ void FastCollectiveGruOut(T* gate_weight,
T* prev_out_value,
T* output_value,
T* gate_value,
T* reset_value,
int frame_size,
ActivationType active_node,
bool origin_mode) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
T a0 = 0.0f;
T b0[TiledSize];
T c0 = 0.0f;
int tiled_mask = ((1 << TiledSize) - 1);
if (prev_out_value) {
for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = reset_value[threadIdx.x + k * TiledSize];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size) {
T xt_0 = gate_value[col + 2 * frame_size];
T gta_0 = gate_value[col];
T htp_0 = 0;
if (prev_out_value) {
htp_0 = prev_out_value[col];
}
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col + 2 * frame_size] = c0;
if (origin_mode) {
output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0;
} else {
output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0;
}
}
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -22,10 +22,6 @@ namespace lite {
namespace cuda {
namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void scale_kernel(int count,
const T* in_data,
......@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T>
__global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
}
......@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
}
template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) {
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void scale(int num, const float*, float*, float, cudaStream_t, float);
template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float);
} // namespace math
......
......@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream);
template <typename T>
void scale(
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence2batch.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void CopyMatrixRowsKernel(const T* src,
T* dst,
const uint64_t* index,
int height,
int width,
bool is_src_index) {
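  // Each block copies blockDim.y rows; within a row, blockDim.x threads stride
  // across `width` elements. `index` maps between source and destination row
  // ids depending on is_src_index.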
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
}
}
template <typename T>
void CopyMatrixRowsFunctor<T>::operator()(
const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream) {
auto src_dims = src.dims();
auto dst_dims = dst->dims();
CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
int height = dst_dims[0];
int width = dst_dims[1];
const auto* src_data = src.data<T>();
auto* dst_data = dst->template mutable_data<T>(TARGET(kCUDA));
index_tensor_.Resize({static_cast<int64_t>(index_lod.size())});
auto* index_tensor_data = index_tensor_.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(index_tensor_data,
index_lod.data(),
sizeof(uint64_t) * index_lod.size(),
IoDirection::HtoD,
stream);
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
template class CopyMatrixRowsFunctor<float>;
template class CopyMatrixRowsFunctor<half>;
template class LoDTensor2BatchFunctor<float>;
template class LoDTensor2BatchFunctor<half>;
template class Batch2LoDTensorFunctor<float>;
template class Batch2LoDTensorFunctor<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <string>
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true, copy the indexed rows of input src to the output
// dst. If is_src_index is false, copy the input src to the indexed of output
// dst. The indexes rows are based on the input index.
void operator()(const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream);
private:
lite::Tensor index_tensor_;
};
template <typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
struct SeqInfo {
SeqInfo(size_t start_val, size_t len_val, size_t seq_val)
: start(start_val), length(len_val), seq_idx(seq_val) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
void operator()(const lite::Tensor& lod_tensor,
lite::Tensor* batch_tensor,
bool is_reverse,
const cudaStream_t& stream) const {
auto lods = lod_tensor.lod();
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (int seq_id = 0; seq_id < static_cast<int>(lod.size()) - 1; ++seq_id) {
size_t length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
return a.length > b.length;
});
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = 0
// batch_start_positions[1] = len(b0)
// batch_start_positions[2] = len(b0) + len(b1)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// seq_order = {1, 0, 2}, the sort order.
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
LoD batch_lods;
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods[2].resize(seq_info.size());
auto* batch_starts = batch_lods[0].data();
auto* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < max_seqlen; ++n) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
++batch_id;
} else {
break;
}
}
batch_starts[n + 1] = batch_id;
}
auto* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
batch_tensor->set_lod(batch_lods);
lite::cuda::math::CopyMatrixRowsFunctor<T> to_batch;
to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream);
CUDA_POST_KERNEL_CHECK;
}
};
template <typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const lite::Tensor& batch_tensor,
lite::Tensor* lod_tensor,
const cudaStream_t& stream) {
auto in_lod = batch_tensor.lod();
    CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at "
                                    "least 2-level sequence information.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
lite::cuda::math::CopyMatrixRowsFunctor<T> to_seq;
to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream);
CUDA_POST_KERNEL_CHECK;
}
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -86,8 +86,7 @@ void SequencePadding(T* pad_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void SequencePadding(float* pad_data,
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
......@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> {
static size_t num_devices();
static size_t maximum_stream() { return 0; }
static int GetComputeCapability() {
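    // Returns the compute capability as major * 10 + minor (e.g. 75 for sm_75).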
int dev_id = GetCurDevice();
int major, minor;
CUDA_CALL(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, dev_id));
CUDA_CALL(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, dev_id));
return major * 10 + minor;
}
static size_t GetCurDevice() {
int dev_id;
cudaGetDevice(&dev_id);
......
......@@ -19,7 +19,7 @@
namespace paddle {
namespace lite {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......@@ -30,7 +30,6 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void**>(r)[-1] = p;
memset(r, 0, size);
return r;
}
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
......
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
  // Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
    VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file does not exist: " << model_path;
return nullptr;
}
  // Create an ACL model client to load the om model
  std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
  // Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
  // Convert the ACL IR graph to the ACL om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph to the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
    LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weight "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
    LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weight "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
  VLOG(3) << "[HUAWEI_ASCEND_NPU] input number is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
  VLOG(3) << "[HUAWEI_ASCEND_NPU] output number is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = reinterpret_cast<size_t>(output_tensor->size());
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
        << device_output_num << ", tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
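A minimal usage sketch of the client implemented above (illustrative only): it assumes the ACL runtime has already been initialized elsewhere, an offline OM model exists at the given path, the input ge::Tensor objects are already filled, and the header path matches this patch's layout.

#include <memory>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"  // assumed path

void RunOnce(std::vector<std::shared_ptr<ge::Tensor>>* inputs,
             std::vector<std::shared_ptr<ge::Tensor>>* outputs) {
  paddle::lite::huawei_ascend_npu::AclModelClient client(/*device_id=*/0);
  if (!client.LoadFromFile("model.om")) return;
  // ModelExecute copies the inputs to the device, runs aclmdlExecute and
  // copies the results back into the output tensors.
  client.ModelExecute(inputs, outputs);
  client.UnloadModel();
}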
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::Tensordesc
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
CHECK(ge_tensor_desc_ != nullptr);
}
  ~TensorDesc() {
    delete ge_tensor_desc_;
    ge_tensor_desc_ = nullptr;
  }
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
ge::TensorDesc* ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
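// Example (illustrative): for an NHWC io dim of {1, 224, 224, 3}, dim_order
// becomes {0, 3, 1, 2}, so GetNumber() returns 1, GetChannel() returns 3 and
// GetHeight()/GetWidth() both return 224.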
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_;
size_t model_weight_size_;
void* model_memory_ptr_;
void* model_weight_ptr_;
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
 * This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
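A minimal sketch of how the two macros above are used by their callers (illustrative only; any ACL call returning an aclError can be wrapped the same way):

#include "lite/backends/huawei_ascend_npu/utils.h"

void ResetDeviceOrDie(int device_id) {
  // Aborts via CHECK_EQ and prints the symbolic error name, e.g.
  // ACL_ERROR_INVALID_DEVICE, if the call does not return ACL_ERROR_NONE.
  ACL_CALL(aclrtResetDevice(device_id));
}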
......@@ -33,7 +33,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() {
}
bool CLRuntime::Init() {
if (initialized_) {
if (is_cl_runtime_initialized_) {
return true;
}
bool is_platform_init = InitializePlatform();
bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init;
initialized_ = true;
LOG(INFO) << "is_platform_init:" << is_platform_init;
LOG(INFO) << "is_device_init:" << is_device_init;
if ((is_platform_init == true) && (is_device_init == true)) {
is_platform_device_init_success_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_;
is_cl_runtime_initialized_ = true;
}
return is_cl_runtime_initialized_;
}
cl::Platform& CLRuntime::platform() {
......@@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() {
}
cl::Device& CLRuntime::device() {
CHECK(device_ != nullptr) << "device_ is not initialized!";
if (device_ == nullptr) {
LOG(ERROR) << "device_ is not initialized!";
}
return *device_;
}
......@@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) {
}
bool CLRuntime::InitializeDevice() {
VLOG(3) << "device_info_.size():" << device_info_.size();
for (auto i : device_info_) {
VLOG(3) << ">>> " << i.first << " " << i.second;
}
if (device_info_.size() > 0 && device_info_.size() <= 2) {
return false;
}
device_info_["PLACEHOLDER"] = 1;
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
// CL_DEVICE_NAME
......@@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() {
status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
CL_CHECK_ERROR(status_);
if (all_devices.empty()) {
LOG(FATAL) << "No OpenCL GPU device found!";
LOG(ERROR) << "No available OpenCL GPU device found!";
return false;
}
device_ = std::make_shared<cl::Device>();
......@@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() {
}
std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
if (0 != device_info_.size()) {
return device_info_;
}
InitializeDevice();
return device_info_;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/backends/opencl/cl_wrapper.h"
typedef enum {
UNKNOWN = 0,
......@@ -68,6 +69,28 @@ class CLRuntime {
public:
static CLRuntime* Global();
bool OpenCLAvaliableForDevice() {
bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
LOG(INFO) << "opencl_lib_found:" << opencl_lib_found;
if (opencl_lib_found == false) return false;
bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
LOG(INFO) << "dlsym_success:" << dlsym_success;
    if (dlsym_success == false) return false;
InitializeDevice();
bool support_fp16 =
static_cast<bool>(device_info_["CL_DEVICE_EXTENSIONS_FP16"]);
LOG(INFO) << "support_fp16:" << support_fp16;
if (support_fp16 == false) return false;
is_device_avaliable_for_opencl_ =
dlsym_success && opencl_lib_found && support_fp16;
LOG(INFO) << "is_device_avaliable_for_opencl_:"
<< is_device_avaliable_for_opencl_;
return is_device_avaliable_for_opencl_;
}
bool Init();
cl::Platform& platform();
......@@ -85,7 +108,7 @@ class CLRuntime {
bool BuildProgram(cl::Program* program, const std::string& options = "");
bool IsInitSuccess() { return is_init_success_; }
bool IsInitSuccess() { return is_platform_device_init_success_; }
std::string cl_path() { return cl_path_; }
......@@ -167,9 +190,11 @@ class CLRuntime {
cl_int status_{CL_SUCCESS};
bool initialized_{false};
bool is_device_avaliable_for_opencl_{false};
bool is_cl_runtime_initialized_{false};
bool is_init_success_{false};
bool is_platform_device_init_success_{false};
};
} // namespace lite
......
......@@ -19,14 +19,16 @@ limitations under the License. */
namespace paddle {
namespace lite {
CLWrapper *CLWrapper::Global() {
static CLWrapper wrapper;
return &wrapper;
}
CLWrapper::CLWrapper() {
CHECK(InitHandle()) << "Fail to initialize the OpenCL library!";
InitFunctions();
opencl_lib_found_ = InitHandle();
CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!";
dlsym_success_ = InitFunctions();
}
bool CLWrapper::InitHandle() {
......@@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() {
}
}
void CLWrapper::InitFunctions() {
bool CLWrapper::InitFunctions() {
CHECK(handle_ != nullptr) << "The library handle can't be null!";
bool dlsym_success = true;
#define PADDLE_DLSYM(cl_func) \
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #cl_func \
LOG(ERROR) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
dlsym_success = false; \
break; \
} \
VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \
......@@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() {
PADDLE_DLSYM(clEnqueueCopyImage);
#undef PADDLE_DLSYM
return dlsym_success;
}
} // namespace lite
......
......@@ -508,13 +508,20 @@ class CLWrapper final {
return clEnqueueCopyImage_;
}
bool OpenclLibFound() { return opencl_lib_found_; }
bool DlsymSuccess() { return dlsym_success_; }
private:
CLWrapper();
CLWrapper(const CLWrapper &) = delete;
CLWrapper &operator=(const CLWrapper &) = delete;
bool InitHandle();
void InitFunctions();
bool InitFunctions();
bool opencl_lib_found_{true};
bool dlsym_success_{true};
void *handle_{nullptr};
clGetPlatformIDsType clGetPlatformIDs_{nullptr};
clGetPlatformInfoType clGetPlatformInfo_{nullptr};
clBuildProgramType clBuildProgram_{nullptr};
......
......@@ -19,7 +19,7 @@
#include <memory>
#include <string>
#include <type_traits>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/target_wrapper.h"
namespace paddle {
namespace lite {
......@@ -82,8 +82,8 @@ void DumpXPUMem(const T* ptr,
size_t item_per_line = 30) {
size_t after_stride_len = (len + stride - 1) / stride;
std::unique_ptr<T[]> cpu_mem(new T[len]);
xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST));
std::unique_ptr<T[]> after_stride(new T[after_stride_len]);
for (size_t i = 0; i < after_stride_len; ++i) {
after_stride[i] = cpu_mem[i * stride];
......
......@@ -19,11 +19,11 @@ namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
XPU_CALL(xpu_malloc(&ptr, size));
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
......@@ -31,10 +31,10 @@ void TargetWrapperXPU::MemcpySync(void* dst,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
......@@ -49,7 +49,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
} else {
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr);
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
}
......
......@@ -16,11 +16,23 @@
#include <memory> // std::unique_ptr
#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free
#include "lite/core/target_wrapper.h"
#include "lite/core/target_wrapper.h" // TargetWrapper
#include "lite/utils/cp_logging.h" // CHECK_EQ
#define XPU_CALL(func) \
{ \
auto e = (func); \
CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
}
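// Usage (illustrative): raw XPU runtime calls in this backend are wrapped as
//   XPU_CALL(xpu_set_device(0));
// which aborts via CHECK_EQ if the wrapped call returns a non-zero status.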
namespace paddle {
namespace lite {
// MAX(lod.size()) = 64
const int XPU_MAX_LOD_SIZE = 64;
// MAX(lod[i + 1] - lod[i]) = 512
const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
......@@ -33,7 +45,7 @@ struct XPUScratchPad {
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
xpu_free(sp->addr_);
XPU_CALL(xpu_free(sp->addr_));
}
delete sp;
}
......@@ -55,7 +67,7 @@ class TargetWrapper<TARGET(kXPU)> {
size_t size,
IoDirection dir);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false);
static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
......@@ -77,11 +89,10 @@ class TargetWrapper<TARGET(kXPU)> {
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
dev_no = atoi(dev_env);
}
xpu_set_device(dev_no);
XPU_CALL(xpu_set_device(dev_no));
}
static std::string multi_encoder_precision; // NOLINT
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)});
{TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
......@@ -32,25 +32,35 @@ void TestCase::CreateInstruction() {
#endif
if (enable_subgraph_op) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
    // filter only data vars (exclude const vars marked persistable)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -60,7 +70,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -46,7 +46,7 @@ class TestCase {
base_scope_(new Scope) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareData();
......
......@@ -17,8 +17,13 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -61,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -84,6 +86,35 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
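// Usage sketch (illustrative): the cache dir is now carried per execution
// scope instead of a process-wide static, e.g.
//   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
//       exec_scope, "/data/local/tmp/npu_model_cache");
//   auto dir = Context<TargetType::kNPU>::SubgraphModelCacheDir(exec_scope);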
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
......@@ -91,8 +122,14 @@ class Context<TargetType::kNPU> {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static std::string subgraph_model_cache_dir_;
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
......@@ -385,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
......@@ -466,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
......
......@@ -18,6 +18,7 @@ lite_cc_library(mir_passes
fusion/conv_activation_fuse_pass.cc
fusion/var_conv_2d_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc
fusion/conv_conv_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
......@@ -32,6 +33,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the nodes
// which are only linked to the control flow op nodes but never linked to the
// other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to by any op
// in the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
      CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
        // Tensor array may only be used as the output vars in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
  // remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes
  // which are only linked to the control flow op nodes but never linked to the
  // other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
    // Remove the nodes which are used in the subblock recursively, and the
    // remaining nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation
lite_cc_library(fuse_conv_bn
SRCS conv_bn_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_conv_conv
SRCS conv_conv_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_elementwise_add_activation
SRCS elementwise_add_activation_fuser.cc
DEPS pattern_matcher_high_api)
......@@ -42,6 +45,7 @@ set(mir_fusers
fuse_conv_activation
fuse_var_conv_activation
fuse_conv_bn
fuse_conv_conv
fuse_quant_dequant
fuse_elementwise_add_activation
fuse_transpose_softmax_transpose
......
......@@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......@@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase {
<< "Y of last fc must have been transposed";
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, scope);
fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places());
block2_stmt->SetOp(fake_subgraph_op);
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
bool has_arm = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
has_arm = true;
break;
}
}
if (!has_arm) {
return;
}
// only support fp32 fusion
for (auto conv_has_bias0 : conv_has_bias_cases) {
for (auto conv_has_bias1 : conv_has_bias_cases) {
for (auto conv_type0 : conv_type_cases) {
for (auto conv_type1 : conv_type_cases) {
VLOG(4) << "conv_has_bias0:" << conv_has_bias0
<< " conv_type0:" << conv_type0;
VLOG(4) << "conv_has_bias1:" << conv_has_bias1
<< " conv_type1:" << conv_type1;
fusion::ConvConvFuser fuser(
conv_type0, conv_type1, conv_has_bias0, conv_has_bias1);
fuser(graph.get());
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
.BindTargets({TARGET(kARM)});
......@@ -14,18 +14,19 @@
#pragma once
#include "lite/backends/xpu/xpu_header_sitter.h"
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace mir {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
class ConvConvFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace xpu
} // namespace kernels
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include <memory>
#include <set>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ConvConvFuser::BuildPattern() {
auto* conv_input0 = VarNode("conv_input0")
->assert_is_op_input(conv_type0_, "Input")
->AsInput();
auto* conv_weight0 = VarNode("conv_weight0")
->assert_is_op_input(conv_type0_, "Filter")
->AsInput();
auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_);
auto* conv_out0 = VarNode("conv_out0")
->assert_is_op_output(conv_type0_, "Output")
->assert_is_op_input(conv_type1_, "Input")
->AsIntermediate();
auto* conv_weight1 = VarNode("conv_weight1")
->assert_is_op_input(conv_type1_, "Filter")
->AsIntermediate();
auto* conv1 = OpNode("conv2d1", conv_type1_)
->assert_is_op(conv_type1_)
->assert_op_attr<int>("groups", 1)
->AsIntermediate();
auto* conv_out1 = VarNode("conv_out1")
->assert_is_op_output(conv_type1_, "Output")
->AsOutput();
if (conv_has_bias0_) {
if (conv_has_bias1_) {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
} else {
conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0});
if (conv_has_bias1_) {
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
}
}
void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto conv_instruct = matched.at("conv2d0")->stmt();
auto conv_op_desc = conv_instruct->mutable_op_info();
auto conv = conv_instruct->op();
auto* scope = conv->scope();
auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info();
// conv0
auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name)
->GetMutable<lite::Tensor>();
// conv1
auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name)
->GetMutable<lite::Tensor>();
// auto groups0 = conv_op_desc->GetAttr<int>("groups");
auto groups1 = conv_op_desc1->GetAttr<int>("groups");
auto strides1 = conv_op_desc1->GetAttr<std::vector<int>>("strides");
auto paddings1 = conv_op_desc1->GetAttr<std::vector<int>>("paddings");
auto dilations1 = conv_op_desc1->GetAttr<std::vector<int>>("dilations");
bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? true : false;
int kw = weight1_t->dims()[2];
int kh = weight1_t->dims()[3];
if (!(kw == 1 && kh == 1)) {
return;
}
CHECK_EQ(enable0_int8, enable1_int8) << "The Conv compute type must be same";
CHECK_EQ(groups1, 1) << "The groups of weight1_dim must be 1";
CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1])
<< "weight0_dims[0] == weight1_dim[1]";
for (int i = 0; i < strides1.size(); i++) {
CHECK_EQ(strides1[i], 1) << "strides[" << i << "]: " << strides1[i]
<< " must be 1";
}
for (int i = 0; i < paddings1.size(); i++) {
CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i]
<< " must be 0";
}
for (int i = 0; i < dilations1.size(); i++) {
CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i]
<< " must be 1";
}
  // compute new_weight and new_bias
///////////////////////////////////////////////////////////////////////////////
// Compute ConvConvFuser
// Before fusion
//
  //   conv0(x) = kx + z = y
  //   conv1(y) = ay + b
  //
  // After fusion:
  //
  //   conv1(conv0(x)) = a(kx + z) + b = akx + az + b
//
// new_weights = ak
// new_bias = az + b
///////////////////////////////////////////////////////////////////////////////
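  // Worked example (illustrative): with scalar "convs" k = 2, z = 3 for the
  // first conv and a = 4, b = 5 for the 1x1 conv, applying them in sequence
  // to x = 1 gives 4 * (2 * 1 + 3) + 5 = 25, while the fused conv with
  // new_weights = ak = 8 and new_bias = az + b = 17 gives 8 * 1 + 17 = 25.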
if (enable0_int8) {
LOG(FATAL) << "it doesn't support";
return;
} else {
// compute new conv_weight
Tensor weight_tensor;
auto in_dims = weight0_t->dims();
auto weight_dims = weight1_t->dims();
const float* din = weight0_t->data<float>();
const float* weights = weight1_t->data<float>();
int oc0 = in_dims[0];
int ic = in_dims[1];
int ih = in_dims[2];
int iw = in_dims[3];
int oc = weight_dims[0];
weight_tensor.Resize({oc, ic, ih, iw});
float* dout = weight_tensor.mutable_data<float>();
ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc);
weight0_t->CopyDataFrom(weight_tensor);
}
// compute new conv_bias
if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") &&
conv_op_desc->Input("Bias").size() > 0) {
auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name)
->GetMutable<lite::Tensor>();
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name)
->GetMutable<lite::Tensor>();
Tensor bias;
bias.CopyDataFrom(*bias_t1);
auto bias_data = bias.mutable_data<float>();
ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1);
bias_t1->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
} else {
Tensor bias;
auto weight_dims = weight1_t->dims();
bias.Resize({weight_dims[0]});
auto bias_d = bias.mutable_data<float>();
ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr);
bias_t0->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias0")->arg()->name}); // conv_bias
}
} else {
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
}
}
conv_op_desc->SetType(conv_type0_);
conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name});
conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name});
conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name});
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1"));
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ConvConvFuser : public FuseBase {
public:
explicit ConvConvFuser(const std::string& conv_type0,
const std::string& conv_type1,
const bool conv_has_bias0,
const bool conv_has_bias1)
: conv_type0_(conv_type0),
conv_type1_(conv_type1),
conv_has_bias0_(conv_has_bias0),
conv_has_bias1_(conv_has_bias1) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
void ComputeNewWeight(float* dout,
const float* din,
const float* weights,
int oc0,
int ic,
int ih,
int iw,
int oc1) {
// input conv_weight0_t weights conv_weight1_t
// output weight_tensor
// ksize = 1
int in_size = ih * iw;
int in_channel_size = ic * in_size;
// out = w1[j, i, ih, iw] * w2[k, j, kw, kh]
// out_dim = [oc1, ic, kh, kw], din_dim = [oc0, ic, kh, kw]
// weight_dim = [oc1, oc0, kh, kw]
for (int k = 0; k < oc1; k++) {
const float* weights_ptr = weights + k * oc0;
float* out_ptr = dout + k * in_channel_size;
for (int c = 0; c < ic; c++) {
float* out_ptr_channel = out_ptr + c * in_size;
const float* din_ptr = din + c * in_size;
for (int i = 0; i < in_size; i++) {
float sum = 0.f;
for (int j = 0; j < oc0; j++) {
sum += din_ptr[j * in_channel_size] * weights_ptr[j];
}
*out_ptr_channel++ = sum;
}
}
}
}
void ComputeNewBias(float* dout,
Tensor* bias0_tensor,
Tensor* weight_tensor,
Tensor* bias1_tensor) {
// input bias0_tensor weight_tensor bias1_tensor
// output bias_tensor
auto in_dims = bias0_tensor->dims();
auto weight_dims = weight_tensor->dims();
const float* din = bias0_tensor->data<float>();
const float* weights = weight_tensor->data<float>();
int ic = in_dims[0];
int oc = weight_dims[0];
// out_k = b0[num, j, 1, 1] * w2[k, j, 1, 1]
if (bias1_tensor) {
const float* din2 = bias1_tensor->data<float>();
for (int k = 0; k < oc; k++) {
const float* weights_ptr = weights + k * ic;
float sum = 0.f;
for (int j = 0; j < ic; j++) {
sum += din[j] * weights_ptr[j];
}
dout[k] = sum + din2[k];
}
} else {
for (int k = 0; k < oc; k++) {
const float* weights_ptr = weights + k * ic;
float sum = 0.f;
for (int j = 0; j < ic; j++) {
sum += din[j] * weights_ptr[j];
}
dout[k] = sum;
}
}
}
private:
std::string conv_type0_{"conv2d"};
std::string conv_type1_{"conv2d"};
bool conv_has_bias0_{false};
bool conv_has_bias1_{false};
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
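A standalone sketch (not part of the patch) that checks the folding arithmetic implemented by ComputeNewWeight/ComputeNewBias in the degenerate case where both convolutions are 1x1, so each conv reduces to a matrix multiply: y = W1 * (W0 * x + b0) + b1 must equal (W1 * W0) * x + (W1 * b0 + b1). All sizes and values are illustrative assumptions.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int ic = 2, oc0 = 3, oc1 = 2;
  // Row-major weights: W0 is [oc0 x ic], W1 is [oc1 x oc0].
  std::vector<float> W0 = {0.5f, -1.0f, 2.0f, 0.25f, -0.5f, 1.5f};
  std::vector<float> W1 = {1.0f, 2.0f, -1.0f, 0.5f, 0.0f, 3.0f};
  std::vector<float> b0 = {0.1f, -0.2f, 0.3f};
  std::vector<float> b1 = {1.0f, -1.0f};
  std::vector<float> x = {2.0f, -3.0f};

  // Fused parameters: Wf = W1 * W0 ([oc1 x ic]) and bf = W1 * b0 + b1,
  // matching ComputeNewWeight/ComputeNewBias for kh = kw = 1.
  std::vector<float> Wf(oc1 * ic, 0.f), bf(oc1, 0.f);
  for (int k = 0; k < oc1; ++k) {
    for (int c = 0; c < ic; ++c) {
      for (int j = 0; j < oc0; ++j) {
        Wf[k * ic + c] += W1[k * oc0 + j] * W0[j * ic + c];
      }
    }
    for (int j = 0; j < oc0; ++j) bf[k] += W1[k * oc0 + j] * b0[j];
    bf[k] += b1[k];
  }

  // Reference: apply the two "convolutions" sequentially.
  std::vector<float> y0(oc0, 0.f), y_ref(oc1, 0.f), y_fused(oc1, 0.f);
  for (int j = 0; j < oc0; ++j) {
    for (int c = 0; c < ic; ++c) y0[j] += W0[j * ic + c] * x[c];
    y0[j] += b0[j];
  }
  for (int k = 0; k < oc1; ++k) {
    for (int j = 0; j < oc0; ++j) y_ref[k] += W1[k * oc0 + j] * y0[j];
    y_ref[k] += b1[k];
    for (int c = 0; c < ic; ++c) y_fused[k] += Wf[k * ic + c] * x[c];
    y_fused[k] += bf[k];
    assert(std::fabs(y_ref[k] - y_fused[k]) < 1e-5f);
    std::printf("k=%d ref=%f fused=%f\n", k, y_ref[k], y_fused[k]);
  }
  return 0;
}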