Unverified commit 0d7b2a32, authored by: H HappyAngel, committed by: GitHub

Merge pull request #141 from PaddlePaddle/develop

pull 
......@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile when profile mode is ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -224,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
......@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc and libatomic symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
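The flag logic above decides whether C++ exceptions and unwind tables are compiled in: any configuration that can propagate exceptions across the API boundary (OpenCL with clang, the Python bindings, LITE_WITH_EXCEPTION, or a non-tiny build) keeps -fexceptions, while tiny builds strip them to save binary size. A minimal C++ sketch of how the LITE_WITH_EXCEPTION definition might gate error handling (the helper name ReportFatal is hypothetical, not from this patch):

```cpp
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

// Hypothetical helper: with LITE_WITH_EXCEPTION defined, errors are thrown
// and may cross the API boundary, which is why the build keeps -fexceptions
// and unwind tables; without it, we log and abort, so -fno-exceptions and
// -fno-unwind-tables are safe and the binary stays small.
inline void ReportFatal(const std::string& msg) {
#ifdef LITE_WITH_EXCEPTION
  throw std::runtime_error(msg);
#else
  std::fprintf(stderr, "%s\n", msg.c_str());
  std::abort();
#endif
}
```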
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. Path to the Huawei Ascend DDK install directory
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directories
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. Find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependency of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.2 dependency of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
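The imported targets above map onto Ascend's ACL runtime (acllib) and the ATC graph-building stack. As a rough sketch of what the runtime half provides once linked, client code typically initializes ACL and binds a device before loading an om model; this is an assumption about typical ACL bring-up, not code from this patch:

```cpp
#include "acl/acl.h"  // from ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include

// Assumed typical ACL bring-up: initialize the runtime once, then bind the
// calling thread to the device chosen through the lite config. Both calls
// come from libascendcl.so, imported as acl_ascendcl above.
bool InitAscendDevice(int device_id) {
  if (aclInit(nullptr) != ACL_ERROR_NONE) return false;  // nullptr = defaults
  return aclrtSetDevice(device_id) == ACL_ERROR_NONE;
}
```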
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# Optional: libhcl.so only exists in HiAI DDK 320 or later, so a missing library is not fatal
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
......@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
......@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -537,6 +557,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -45,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
......@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if(WIN32)
target_link_libraries(paddle_light_api_shared shlwapi.lib)
endif()
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
......@@ -94,6 +95,7 @@ if (WITH_TESTING)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
APU_DEPS ${apu_kernels})
endif()
......@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
......@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
......@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -191,6 +200,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -415,7 +427,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -430,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -445,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -459,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -470,8 +486,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -487,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
......@@ -58,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source_file name from op type and kernel type
......@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;
core::KernelPickFactor factor;
......
......@@ -52,12 +52,12 @@ class LITE_API Predictor {
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
const std::shared_ptr<Scope>& root,
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
const std::vector<std::string>& vars_to_clone = {})
: program_desc_(program_desc), scope_(root_scope) {
Program program(program_desc_, scope_, valid_places, vars_to_clone);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
valid_places_ = valid_places;
......@@ -79,30 +79,28 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const {
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
return predictor;
return std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
}
std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
const std::vector<std::string>& vars_to_clone) const {
CHECK(program_desc_) << "Both program and scope of current predictor "
"should not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predictor should "
"not be nullptr in Clone mode.";
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
program_desc_, scope_, valid_places_, vars_to_clone);
for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
for (auto var_name : vars_to_clone) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor = predictor->scope_->Var(var_name)->GetMutable<Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
return predictor;
......@@ -140,6 +138,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......
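The reworked Clone above shares program_desc_ and the weight scope between predictors, while Clone(vars_to_clone) additionally gives each clone private copies of the listed variables. A hedged usage sketch (the variable name is illustrative, not from the patch):

```cpp
#include <memory>

// Assumes `predictor` is an already-built paddle::lite::Predictor.
void CloneExample(paddle::lite::Predictor& predictor) {
  // Full sharing: the clone reuses the program desc, the weight scope,
  // and the valid places of the parent.
  std::shared_ptr<paddle::lite::Predictor> worker = predictor.Clone();

  // Partial isolation: "hidden_state" gets a private copy in the clone's
  // exec scope, seeded from the parent's tensor via CopyDataFrom().
  std::shared_ptr<paddle::lite::Predictor> isolated =
      predictor.Clone({"hidden_state"});
  (void)worker;
  (void)isolated;
}
```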
......@@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
......
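Together with the ConfigBase changes later in this patch, the Init hook above lets users pin the Ascend device and the subgraph model cache from the config. A sketch of the intended call pattern, under the assumption that set_subgraph_model_cache_dir is the setter matching the subgraph_model_cache_dir() getter shown in this patch:

```cpp
#include "paddle_api.h"  // Paddle-Lite C++ API header; install path may vary

using namespace paddle::lite_api;  // NOLINT

// Hedged sketch: wiring the new device-ID and cache options into a full
// (CxxConfig) predictor targeting the Huawei Ascend NPU.
std::shared_ptr<PaddlePredictor> BuildAscendPredictor() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  config.set_device_id(0);  // run the partitioned subgraphs on Ascend card 0
  config.set_subgraph_model_cache_dir("/tmp/ascend_cache");  // assumed setter
  config.set_valid_places({
      Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
      Place{TARGET(kX86), PRECISION(kFloat)},  // host fallback
  });
  return CreatePaddlePredictor(config);
}
```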
......@@ -22,16 +22,16 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -43,15 +43,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -60,7 +60,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -109,16 +109,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
const cpp::ProgramDesc& prog = cpp_program_desc_;
auto current_block = prog.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc const*> feeds;
std::vector<cpp::OpDesc const*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -133,55 +134,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
// Only extract the ops from the main block desc and generate the runtime
// program from it
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
const cpp::ProgramDesc& cpp_desc = cpp_program_desc_;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -207,10 +188,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_desc.BlocksSize(); i++) {
auto* block = cpp_desc.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
......
......@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
......@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
// Get the lengh of the first column: maximum length of the op_type
......
......@@ -126,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
......@@ -145,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
// set and get device ID
void set_device_id(int device_id) { device_id_ = device_id; }
int get_device_id() const { return device_id_; }
};
/// CxxConfig is the config for the Full feature predictor.
......
......@@ -76,7 +76,8 @@ const std::string& TargetToStr(TargetType target) {
"bm",
"mlu",
"rknpu",
"apu"};
"apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -119,7 +120,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM",
"kMLU",
"kRKNPU",
"kAPU"};
"kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -164,7 +166,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11,
kRKNPU = 12,
kAPU = 13,
NUM = 14, // number of fields.
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
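Because TargetToStr and TargetRepr index their string tables with static_cast<int>(target), every new TargetType entry must be appended to all the tables at the same position, which is what the three table edits above do. A toy, self-contained sketch of a compile-time guard one could add to keep them aligned (not part of this patch):

```cpp
#include <cstddef>

// Toy version of the TargetType / string-table pairing used in the patch.
enum class TargetType : int { kUnk = 0, kHost = 1, kX86 = 2, NUM = 3 };
static const char* target2string[] = {"unk", "host", "x86"};
static_assert(sizeof(target2string) / sizeof(target2string[0]) ==
                  static_cast<std::size_t>(TargetType::NUM),
              "target2string must have one entry per TargetType value");
```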
......@@ -48,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
......
......@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace arm {
namespace math {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
......@@ -30,9 +30,16 @@ namespace lite {
namespace cuda {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename Dtype>
inline __device__ Dtype Sigmoid(const Dtype a) {
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-a));
const Dtype min = SIGMOID_THRESHOLD_MIN;
const Dtype max = SIGMOID_THRESHOLD_MAX;
Dtype tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-tmp));
}
template <>
......@@ -63,6 +70,7 @@ inline __device__ half ReLU(const half a) {
template <typename Dtype>
inline __device__ Dtype Tanh(const Dtype a) {
Dtype tmp = static_cast<Dtype>(-2.0) * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
static_cast<Dtype>(1.0);
}
......
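The clamps above bound the operand of expf: SIGMOID_THRESHOLD_MIN/MAX match the saturation thresholds Paddle uses for recurrent activations, and EXP_MAX_INPUT keeps expf from overflowing for large-magnitude inputs to Tanh. A host-side C++ sketch of the same clamped sigmoid, for illustration only:

```cpp
#include <algorithm>
#include <cmath>

// Clamped sigmoid mirroring the CUDA kernel above: inputs are saturated to
// [-40, 13] before the exp so that expf never overflows, matching the
// recurrent-activation thresholds defined above.
inline float ClampedSigmoid(float a) {
  const float kMin = -40.0f, kMax = 13.0f;
  float t = std::min(std::max(a, kMin), kMax);
  return 1.0f / (1.0f + std::exp(-t));
}
```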
......@@ -22,10 +22,6 @@ namespace lite {
namespace cuda {
namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void scale_kernel(int count,
const T* in_data,
......@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T>
__global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
}
......@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
}
template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) {
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void scale(int num, const float*, float*, float, cudaStream_t, float);
template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float);
} // namespace math
......
......@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream);
template <typename T>
void scale(
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
......@@ -32,7 +32,7 @@ __global__ void CopyMatrixRowsKernel(const T* src,
bool is_src_index) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * gridDim.x + idy;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
......@@ -72,7 +72,7 @@ void CopyMatrixRowsFunctor<T>::operator()(
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, true);
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
......
......@@ -53,11 +53,11 @@ class LoDTensor2BatchFunctor {
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
struct SeqInfo {
SeqInfo(size_t start, size_t length, size_t seq_idx)
: start_(start), length_(length), seq_idx_(seq_idx) {}
size_t start_;
size_t length_;
size_t seq_idx_;
SeqInfo(size_t start_val, size_t len_val, size_t seq_val)
: start(start_val), length(len_val), seq_idx(seq_val) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
......@@ -76,7 +76,7 @@ class LoDTensor2BatchFunctor {
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
return a.length_ > b.length_;
return a.length > b.length;
});
// Calculate the start position of each batch.
......@@ -106,7 +106,7 @@ class LoDTensor2BatchFunctor {
batch_lods.emplace_back(std::vector<uint64_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length_;
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
......@@ -119,8 +119,8 @@ class LoDTensor2BatchFunctor {
for (size_t n = 0; n < max_seqlen; ++n) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length_;
size_t start = seq_info[i].start_;
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
......@@ -133,7 +133,7 @@ class LoDTensor2BatchFunctor {
}
auto* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx_;
seq_order[i] = seq_info[i].seq_idx;
}
batch_tensor->set_lod(batch_lods);
......
......@@ -86,8 +86,7 @@ void SequencePadding(T* pad_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void SequencePadding(float* pad_data,
......
......@@ -19,7 +19,7 @@
namespace paddle {
namespace lite {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......@@ -30,7 +30,6 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void**>(r)[-1] = p;
memset(r, 0, size);
return r;
}
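// A minimal sketch of the over-allocate-and-align scheme used here, assuming
// MALLOC_ALIGN is a power of two (16 after this change):
//
//   char* p = static_cast<char*>(malloc(size + sizeof(void*) + MALLOC_ALIGN - 1));
//   void* r = reinterpret_cast<void*>(
//       reinterpret_cast<size_t>(p + sizeof(void*) + MALLOC_ALIGN - 1) &
//       ~(static_cast<size_t>(MALLOC_ALIGN) - 1));
//   static_cast<void**>(r)[-1] = p;  // stash the raw pointer just below r
//   // ... Free() recovers it later: free(static_cast<void**>(ptr)[-1]);
//
// Dropping the memset(r, 0, size) means callers now receive uninitialized
// memory, matching plain malloc semantics.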
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
......
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path;
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
// Convert the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
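// A hedged sketch of the build-then-load flow (graph construction omitted;
// the variable names are hypothetical):
//
//   std::vector<ge::Operator> inputs /* = ... */, outputs /* = ... */;
//   std::vector<char> om_model;
//   auto& device = paddle::lite::huawei_ascend_npu::Device::Global();
//   if (device.Build(inputs, outputs, &om_model)) {
//     auto model_client = device.LoadFromMem(om_model, /*device_id=*/0);
//   }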
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph to the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
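// Call sites go through the singleton; e.g. loading a cached om file
// (a sketch: the path and device id below are placeholders):
//
//   auto client = paddle::lite::huawei_ascend_npu::Device::Global()
//                     .LoadFromFile("/path/to/model.om", /*device_id=*/0);
//   if (!client) { /* fall back or report the error */ }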
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = output_tensor->size();
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
<< device_output_num << "tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
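// End-to-end use of AclModelClient (a sketch; ge::Tensor allocation is
// omitted and every name below is hypothetical):
//
//   AclModelClient client(/*device_id=*/0);
//   client.LoadFromFile("model.om");
//   std::vector<TensorDesc> in_descs, out_descs;
//   client.GetModelIOTensorDim(&in_descs, &out_descs);
//   // ... build ge::Tensor inputs/outputs matching the descriptors ...
//   client.ModelExecute(&inputs, &outputs);
//   client.UnloadModel();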
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::Tensordesc
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
CHECK(ge_tensor_desc_ != nullptr);
}
~TensorDesc() { ge_tensor_desc_ = nullptr; }
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
ge::TensorDesc* ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
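// Sketch: for an NHWC input with dims {1, 224, 224, 3}, dim_order becomes
// {0, 3, 1, 2}, so the accessors still answer in NCHW terms:
//
//   TensorDesc desc(ACL_FLOAT, dims, ACL_FORMAT_NHWC);
//   desc.GetNumber();   // 1   -> dims[0]
//   desc.GetChannel();  // 3   -> dims[3]
//   desc.GetHeight();   // 224 -> dims[1]
//   desc.GetWidth();    // 224 -> dims[2]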
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_;
size_t model_weight_size_;
void* model_memory_ptr_;
void* model_weight_ptr_;
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
* This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
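// Usage sketch: wrap any status-returning ACL/ATC call whose failure should
// abort, e.g.
//
//   ACL_CALL(aclrtSetDevice(0));
//   ATC_CALL(ge::aclgrphBuildInitialize(global_options));
//
// Note that on failure the streamed (msg) evaluates the wrapped expression
// again while building the fatal message, so these macros suit calls that
// can safely be re-issued once.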
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
......@@ -33,7 +33,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)});
{TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
......@@ -32,25 +32,35 @@ void TestCase::CreateInstruction() {
#endif
if (enable_subgraph_op) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
// keep only data inputs (skip const vars, which are marked persistable)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -60,7 +70,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -46,7 +46,7 @@ class TestCase {
base_scope_(new Scope) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareData();
......
......@@ -17,8 +17,13 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -61,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -84,6 +86,35 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
#endif
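// Sketch of threading the cache dir through the scope (names hypothetical):
//
//   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
//       exec_scope, "/data/local/tmp/npu_cache");
//   auto dir = Context<TargetType::kNPU>::SubgraphModelCacheDir(exec_scope);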
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
......@@ -91,8 +122,14 @@ class Context<TargetType::kNPU> {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static std::string subgraph_model_cache_dir_;
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
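// The Ascend context keeps these settings in thread_local statics (the
// change above), so each worker thread sets its own values; a sketch:
//
//   HuaweiAscendNPUContext::SetSubgraphModelCacheDir("/tmp/ascend_cache");
//   HuaweiAscendNPUContext::SetHuaweiAscendDeviceID(0);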
......@@ -385,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
......@@ -466,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
......
......@@ -33,6 +33,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the
// nodes which are only linked to the control flow op nodes but never linked
// to the other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to by any op
// in the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < static_cast<int>(block_size));
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
// A tensor array may be used only as an output var in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
// remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes,
// i.e. the nodes which are only linked to the control flow op nodes but
// never linked to the other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
// Remove the nodes which are used in the subblock recursively; the
// remaining nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......@@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase {
<< "Y of last fc must have been transposed";
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, scope);
fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places());
block2_stmt->SetOp(fake_subgraph_op);
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
......@@ -34,20 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
}
// fuse quantized node and dequant node
for (auto& op_type :
{"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) {
std::vector<std::string> quantized_op_types = {
"conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"};
for (auto& op_type : quantized_op_types) {
fusion::DequantOpFuser fuser(op_type);
fuser(graph.get());
}
for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto& op_type : quantized_op_types) {
fusion::ChannelWiseDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// process quant_dequant_node
fusion::DeleteQuantDequantOpFuser dqd_fuser;
dqd_fuser(graph.get());
std::vector<std::string> quant_dequant_op_types = {
"fake_quantize_dequantize_abs_max",
"fake_quantize_dequantize_moving_average_abs_max"};
for (auto& op_type : quant_dequant_op_types) {
fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type);
dqd_fuser(graph.get());
}
}
} // namespace mir
......
......@@ -175,11 +175,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
// Arm CPU does not support conv2d_transpose
if (quantized_op_type_ != "conv2d_transpose") {
op_desc.SetAttr("enable_int8", true);
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetInputScale(weight_name, weight_scale);
// change the weight from the float type to int8 type.
......@@ -284,7 +280,6 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
}
// Arm CPU does not support conv2d_transpose
if (quantized_op_type_ != "conv2d_transpose") {
op_desc.SetAttr("enable_int8", true);
}
......@@ -320,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
}
void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max";
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* input_act_node = VarNode("input_act_node")
->assert_is_op_input(quant_dequant_op_type_, "X");
auto* quant_dequant_node =
OpNode("quant_dequant_node", quant_dequant_op_type_)
->assert_is_op(quant_dequant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_dequant_op_type, "OutScale");
->assert_is_op_output(quant_dequant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")
->assert_is_op_output(quant_dequant_op_type, "Out");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
->assert_is_op_output(quant_dequant_op_type_, "Out");
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type_, "InScale");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
} else {
quant_dequant_node->LinksFrom({input_act_node});
}
output_scale_node->LinksFrom({quant_dequant_node});
output_act_node->LinksFrom({quant_dequant_node});
}
void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_scale_node = matched.at("input_scale_node");
auto* input_act_node = matched.at("input_act_node");
auto* quant_dequant_node = matched.at("quant_dequant_node");
auto* output_scale_node = matched.at("output_scale_node");
......@@ -373,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
}
// delete nodes and edges
std::set<const Node*> nodes2rm = {
input_scale_node, quant_dequant_node, output_scale_node, output_act_node};
quant_dequant_node, output_scale_node, output_act_node};
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node = matched.at("input_scale_node");
nodes2rm.insert(input_scale_node);
}
GraphSafeRemoveNodes(graph, nodes2rm);
}
......
......@@ -86,17 +86,22 @@ class ChannelWiseDequantOpFuser : public FuseBase {
std::string quantized_op_type_{};
};
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
* quantized_op" can be deteted by this fuser. The fuser modifies the input
* scale for the quantized_op and deletes the fake_quant_dequant_op.
/* The pattern like "fake_quantize_dequantize_op + quantized_op" can be
* detected by this fuser. The fuser modifies the input scale for the
* quantized_op and deletes the fake_quant_dequant_op.
*/
class DeleteQuantDequantOpFuser : public FuseBase {
public:
explicit DeleteQuantDequantOpFuser(const std::string& quant_dequant_op_type)
: quant_dequant_op_type_(quant_dequant_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quant_dequant_op_type_{};
};
} // namespace fusion
......
......@@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes_in_order = graph->StmtTopologicalOrder();
}
insts_.emplace_back();
for (auto& item : nodes_in_order) {
if (item->IsStmt()) {
auto& stmt = item->AsStmt();
......@@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front()));
}
}
}
......
......@@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass {
}
private:
std::vector<Instruction> insts_;
std::vector<std::vector<Instruction>> insts_;
};
} // namespace mir
......
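// insts_ is now indexed as insts_[block_idx][instruction_idx]; the Apply()
// loop above fills it block by block (a sketch of the shape, assuming one
// emplace_back() per block):
//
//   std::vector<std::vector<Instruction>> insts;
//   insts.emplace_back();                              // enter a new block
//   insts.back().emplace_back(op, std::move(kernel));  // append instruction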
......@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU),
TARGET(kMLU)});
TARGET(kMLU),
TARGET(kHuaweiAscendNPU)});
......@@ -284,13 +284,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
head_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateInputTo(
sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -444,21 +450,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
tail_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* graph like this
* subgraph_op_0
* / \
* / \
* subgraph_op_1 host_op
*/
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -482,15 +494,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
}
}
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node,
Node* inst_node) {
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx);
CHECK(sub_op_desc);
if (sub_op_desc->Type() == "conv2d") {
for (auto& names : sub_op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
......@@ -746,19 +765,23 @@ std::pair<bool, std::string> CheckOutputAndInsert(
// Insert cast ops on MLU to avoid casting on the CPU
void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
const Type* subgraph_type) {
auto subgraph_op = subgraph_node->AsStmt().op();
CHECK_EQ(subgraph_op->Type(), "subgraph");
auto op = dynamic_cast<operators::SubgraphOp*>(subgraph_op.get());
CHECK(op);
auto block_desc = op->GetSubBlock();
CHECK_EQ(subgraph_node->AsStmt().op()->Type(), "subgraph");
auto subgraph_op =
dynamic_cast<operators::SubgraphOp*>(subgraph_node->AsStmt().op().get());
CHECK(subgraph_op);
auto sub_program_desc = subgraph_op->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc = const_cast<cpp::BlockDesc*>(
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx));
// create a new block desc to keep op sequence correct
cpp::BlockDesc* new_block_desc = new cpp::BlockDesc();
new_block_desc->ClearOps();
new_block_desc->ClearVars();
new_block_desc->SetIdx(block_desc->Idx());
new_block_desc->SetParentIdx(block_desc->ParentIdx());
new_block_desc->SetForwardBlockIdx(block_desc->ForwardBlockIdx());
cpp::BlockDesc new_block_desc;
new_block_desc.ClearOps();
new_block_desc.ClearVars();
new_block_desc.SetIdx(sub_block_desc->Idx());
new_block_desc.SetParentIdx(sub_block_desc->ParentIdx());
new_block_desc.SetForwardBlockIdx(sub_block_desc->ForwardBlockIdx());
// Find all I/O vars that are not weights or persistent
std::list<std::string> i_names, o_names;
......@@ -769,8 +792,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
auto input_name = input->AsArg().name;
if (!(input->AsArg().is_weight || input->AsArg().is_persist)) {
i_names.emplace_back(input_name);
auto ret = CheckInputAndInsert(op->scope(),
new_block_desc,
auto ret = CheckInputAndInsert(subgraph_op->scope(),
&new_block_desc,
input_name,
input->AsArg().type,
subgraph_type);
......@@ -783,8 +806,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
auto output_name = output->AsArg().name;
if (!(output->AsArg().is_weight || output->AsArg().is_persist)) {
o_names.emplace_back(output_name);
auto ret = CheckOutputAndInsert(op->scope(),
block_desc,
auto ret = CheckOutputAndInsert(subgraph_op->scope(),
sub_block_desc,
output_name,
output->AsArg().type,
subgraph_type);
......@@ -795,46 +818,48 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
}
// update input and output
for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); ++op_idx) {
auto desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
auto new_desc = new_block_desc->AddOp<cpp::OpDesc>();
*new_desc = *desc;
if (desc->Type() != "layout" && desc->Type() != "cast") {
auto op_input_args = new_desc->InputArgumentNames();
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto sub_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx);
auto new_op_desc = new_block_desc.AddOp<cpp::OpDesc>();
*new_op_desc = *sub_op_desc;
if (sub_op_desc->Type() != "layout" && sub_op_desc->Type() != "cast") {
auto op_input_args = new_op_desc->InputArgumentNames();
for (auto& input_arg : op_input_args) {
auto op_input = new_desc->Input(input_arg);
auto op_input = new_op_desc->Input(input_arg);
for (auto& it : i_names) {
auto index = std::find(op_input.begin(), op_input.end(), it);
if (index != op_input.end() &&
node_replace.find(it) != node_replace.end()) {
index = op_input.erase(index);
op_input.emplace(index, node_replace.at(it));
VLOG(4) << new_desc->Type() << "] change input from " << it
VLOG(4) << new_op_desc->Type() << "] change input from " << it
<< " to " << node_replace.at(it);
}
}
new_desc->SetInput(input_arg, op_input);
new_op_desc->SetInput(input_arg, op_input);
}
auto op_output_args = new_desc->OutputArgumentNames();
auto op_output_args = new_op_desc->OutputArgumentNames();
for (auto& output_arg : op_output_args) {
auto op_output = new_desc->Output(output_arg);
auto op_output = new_op_desc->Output(output_arg);
for (auto& it : o_names) {
auto index = std::find(op_output.begin(), op_output.end(), it);
if (index != op_output.end() &&
node_replace.find(it) != node_replace.end()) {
index = op_output.erase(index);
op_output.emplace(index, node_replace.at(it));
VLOG(4) << new_desc->Type() << "] change output from " << it
VLOG(4) << new_op_desc->Type() << "] change output from " << it
<< " to " << node_replace.at(it);
}
}
new_desc->SetOutput(output_arg, op_output);
new_op_desc->SetOutput(output_arg, op_output);
}
}
}
op->SetSubBlock(new_block_desc);
*sub_block_desc = new_block_desc;
}
void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) {
......
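The recurring refactor in this pass replaces the direct `GetSubBlock()` call with an indirection: fetch the subgraph op's whole program desc, read the `sub_block` attribute, and look the block up by index. A simplified model of that lookup, with toy `ProgramDesc`/`BlockDesc` structs and a plain attribute map standing in for `cpp::ProgramDesc`, `cpp::BlockDesc`, and the op-info attributes:

```cpp
#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy stand-ins (assumptions) for cpp::BlockDesc / cpp::ProgramDesc.
struct BlockDesc { std::vector<std::string> op_types; };
struct ProgramDesc { std::vector<BlockDesc> blocks; };

int main() {
  ProgramDesc program;
  program.blocks.push_back({{"feed", "subgraph", "fetch"}});  // block 0
  program.blocks.push_back({{"conv2d", "relu"}});             // block 1
  // The subgraph op stores the index of its block as an attribute.
  std::map<std::string, int> subgraph_op_attrs = {{"sub_block", 1}};

  int sub_block_idx = subgraph_op_attrs.at("sub_block");
  BlockDesc* sub_block = &program.blocks[sub_block_idx];
  assert(sub_block->op_types.size() == 2);
  for (const auto& op_type : sub_block->op_types)
    std::cout << op_type << "\n";
}
```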
......@@ -153,60 +153,61 @@ Node *SSAGraph::GraphCreateInstructNode(
}
void SSAGraph::Build(const Program &program,
const std::vector<Place> &valid_places) {
const std::vector<Place> &valid_places,
int block_idx) {
CHECK(node_storage_.empty());
auto weights_name = program.weights();
auto is_weights = [&](const std::string &name) -> bool {
auto it = std::find(weights_name.begin(), weights_name.end(), name);
if (it == weights_name.end()) return false;
auto weights = program.weights();
auto is_weight = [&](const std::string &name) -> bool {
auto it = std::find(weights.begin(), weights.end(), name);
if (it == weights.end()) return false;
return true;
};
std::map<std::string, PrecisionType> var_types = program.var_data_type();
std::map<std::string, mir::Node *> arg_update_node_map_;
for (auto &op : program.ops()) {
auto var_type_map = program.var_type_map();
std::map<std::string, mir::Node *> arg_update_node_map;
for (auto &op : program.ops(block_idx)) {
VLOG(3) << op->op_info()->Type();
auto *op_node = GraphCreateInstructNode(op, valid_places);
for (const std::string &name : op->op_info()->input_names()) {
auto *op_info = op->op_info();
const auto &op_type = op_info->Type();
for (const auto &var_name : op_info->input_names()) {
mir::Node *arg_node = nullptr;
if (arg_update_node_map_.count(name)) {
arg_node = arg_update_node_map_.at(name);
if (arg_update_node_map.count(var_name)) {
arg_node = arg_update_node_map.at(var_name);
} else {
node_storage_.emplace_back();
arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
}
if (var_types.count(name)) {
if (var_type_map.count(var_name)) {
if (!arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->arg()->type = var_type_map[var_name];
}
// Store the original data type of the output tensors for
// type_precision_cast_pass, to keep the output types consistent between
// the original graph and the optimized graph
if (op->op_info()->Type() == "fetch") {
if (op_type == "fetch") {
op->mutable_op_info()->SetAttr<int>(
"data_type", static_cast<int>(var_types[name]));
"data_type",
static_cast<int>(var_type_map[var_name]->precision()));
}
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
}
for (const std::string &name : op->op_info()->output_names()) {
for (const auto &var_name : op->op_info()->output_names()) {
node_storage_.emplace_back();
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
if (var_type_map.count(var_name) && !arg_node->arg()->type) {
arg_node->arg()->type = var_type_map[var_name];
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(op_node, arg_node);
}
......
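`arg_update_node_map` is what makes the graph SSA-like: every later read of a variable links to the node created by its most recent write, and every write creates a fresh node. A toy version of that bookkeeping (the `Op` struct is a stand-in for `OpLite` plus its op info):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy op: reads `inputs`, writes `outputs`.
struct Op {
  std::string type;
  std::vector<std::string> inputs, outputs;
};

int main() {
  std::vector<Op> ops = {{"conv2d", {"x"}, {"y"}},
                         {"relu", {"y"}, {"y"}},  // writes y again
                         {"fetch", {"y"}, {}}};
  std::map<std::string, int> arg_update_node_map;  // var -> latest node id
  int next_node_id = 0;
  for (const auto& op : ops) {
    for (const auto& in : op.inputs) {
      if (!arg_update_node_map.count(in))  // first sight: create an arg node
        arg_update_node_map[in] = next_node_id++;
      std::cout << op.type << " reads " << in << " (node "
                << arg_update_node_map[in] << ")\n";
    }
    for (const auto& out : op.outputs)
      arg_update_node_map[out] = next_node_id++;  // every write: fresh node
  }
  // fetch reads the node written by relu, not the one written by conv2d.
}
```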
......@@ -35,9 +35,13 @@ class GraphBase {};
class SSAGraph : GraphBase {
public:
// @param program: the op program
// @param program: the target program with vars and ops
// @param valid_places: the valid places user set for the system.
void Build(const Program &program, const std::vector<Place> &valid_places);
// @param block_idx: the block index in the target program, default is 0 (the
// main block)
void Build(const Program &program,
const std::vector<Place> &valid_places,
int block_idx = kRootBlockIdx);
void RemoveNode(const mir::Node *node);
std::vector<mir::Node *> StmtTopologicalOrder();
......
......@@ -411,16 +411,17 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of the ops and vars of the
// target subgraph, and sub_block_idx is set as an attribute of the subgraph
// op; sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1);
auto sub_block_desc = new cpp::BlockDesc();
// Create a program desc and a block desc for storing all of the ops and vars
// of the target subgraph, and sub_block_idx is set as an attribute of the
// subgraph op; sub_block_idx = 0 means it's a new subgraph op
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
for (auto &op_node : subgraph_nodes) {
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_node->AsStmt().op_info();
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_node->AsStmt().op_info();
}
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
......@@ -437,13 +438,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
&local_var_nodes,
&unused_var_nodes);
// A simplified model without the original weight/local/unused nodes on the
// subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to
// true and Predictor->Run(...), Predictor->Save(...) is called.
// subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to
// true (default) and Predictor->Run(...), Predictor->Save(...) is called.
std::set<Node *> input_var_nodes(idata_var_nodes.begin(),
idata_var_nodes.end());
std::set<Node *> output_var_nodes(odata_var_nodes.begin(),
odata_var_nodes.end());
if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) {
if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) {
input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end());
output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end());
output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end());
......@@ -476,7 +477,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetOutput("Outputs", output_var_names);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp *>(subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
......
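The flag above flipped from an opt-in `SUBGRAPH_DISABLE_ONLINE_MODE` to an on-by-default `SUBGRAPH_ONLINE_MODE`, read with a default value. A sketch of a `GetBoolFromEnv`-style helper with a default; this is a hypothetical re-implementation modeled on the call site, not the real utility in Paddle Lite:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical re-implementation of GetBoolFromEnv(name, default).
bool GetBoolFromEnv(const std::string& name, bool default_value = false) {
  const char* raw = std::getenv(name.c_str());
  if (raw == nullptr) return default_value;  // unset -> use the default
  std::string value(raw);
  return value == "1" || value == "true" || value == "ON";
}

int main() {
  // Unset: online mode is on by default, so weight/local/unused vars are
  // kept on the subgraph op; exporting SUBGRAPH_ONLINE_MODE=0 disables it.
  std::cout << std::boolalpha
            << GetBoolFromEnv("SUBGRAPH_ONLINE_MODE", true) << "\n";
}
```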
......@@ -141,12 +141,11 @@ std::vector<std::string> AddFetchDesc(
}
TEST(Subgraph, detect_simple_model) {
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
// Build a simple network
program_desc.ClearBlocks();
auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>();
block_desc->ClearOps();
block_desc->ClearVars();
auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
......@@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) {
"the path of model files.";
return;
}
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
auto scope = std::make_shared<Scope>();
LoadModelPb(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
scope.get(),
&program_desc,
program_desc.get(),
!FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
false);
std::vector<Place> valid_places({
......@@ -200,6 +199,9 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
......
......@@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void HuaweiAscendNPUSubgraphPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
......@@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass,
paddle::lite::mir::HuaweiAscendNPUSubgraphPass)
.BindTargets({TARGET(kHuaweiAscendNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
......
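The `USE_SUBGRAPH_BRIDGE` block in the new pass is the usual X-macro trick: the bridges header is just a list of `USE_SUBGRAPH_BRIDGE(op, target)` lines, and each pass redefines the macro to harvest the op names it supports. A self-contained sketch of the mechanism; the three bridge lines below are inlined and invented for illustration (the real list lives in `paddle_use_bridges.h`):

```cpp
#include <iostream>
#include <set>
#include <string>

// Each expansion inserts the stringized op name into the set in scope.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);

int main() {
  std::set<std::string> supported_lists;
  USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU)
  USE_SUBGRAPH_BRIDGE(pool2d, kHuaweiAscendNPU)
  USE_SUBGRAPH_BRIDGE(softmax, kHuaweiAscendNPU)
#undef USE_SUBGRAPH_BRIDGE

  // The teller in the pass then just checks membership:
  std::cout << supported_lists.count("conv2d") << "\n";  // 1 -> fusable
  std::cout << supported_lists.count("while") << "\n";   // 0 -> stays on host
}
```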
......@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class HuaweiAscendNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
......@@ -187,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
valid_places.push_back(
lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
......
......@@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op,
op_desc->GetAttr<std::vector<std::string>>("input_data_names");
std::replace(input_data_names.begin(), input_data_names.end(), from, to);
op_desc->SetAttr("input_data_names", input_data_names);
auto* subblock_desc = static_cast<operators::SubgraphOp*>(op)->GetSubBlock();
CHECK(subblock_desc);
for (size_t i = 0; i < subblock_desc->OpsSize(); i++) {
auto* subblock_op_desc = subblock_desc->GetOp<cpp::OpDesc>(i);
for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) {
for (auto& subblock_var_name : subblock_op_input.second) {
if (subblock_var_name == from) {
subblock_var_name = to;
auto sub_program_desc =
static_cast<operators::SubgraphOp*>(op)->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx = op_desc->GetAttr<int32_t>("sub_block");
auto sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) {
for (auto& sub_var_name : sub_op_input.second) {
if (sub_var_name == from) {
sub_var_name = to;
}
}
}
......
......@@ -59,25 +59,46 @@ class VariablePlaceInferencePass : public DebugPass {
}
// Set the type of the weight
void SetWeightType(Node* w,
void SetWeightType(Node* weight_node,
const LiteType& type,
const std::map<std::string, bool>& lite_with_targets) {
const std::map<std::string, bool>& with_targets) {
VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
if (lite_with_targets.at("kFPGA")) {
w->AsArg().type = LiteType::GetTensorTy(
if (with_targets.at("kFPGA")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy(
} else if (with_targets.at("kOpenCL")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (lite_with_targets.at("kCUDA")) {
w->AsArg().type = LiteType::GetTensorTy(
} else if (with_targets.at("kCUDA")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else {
w->AsArg().type = LiteType::GetTensorTy(
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
}
}
// Update a's kUnk fields from b's fields.
void UpdateTypeFrom(const Type** a, const Type* b) {
auto target = (*a)->target();
auto precision = (*a)->precision();
auto layout = (*a)->layout();
if (target == TARGET(kUnk)) {
target = b->target();
}
if (precision == PRECISION(kUnk)) {
precision = b->precision();
}
if (layout == DATALAYOUT(kUnk)) {
layout = b->layout();
}
if ((*a)->IsTensor() && b->IsTensor()) {
*a = LiteType::GetTensorTy(target, precision, layout);
} else if ((*a)->IsTensorList() && b->IsTensorList()) {
*a = LiteType::GetTensorListTy(target, precision, layout);
}
}
void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
......@@ -88,122 +109,90 @@ class VariablePlaceInferencePass : public DebugPass {
}
return false;
};
std::map<std::string, bool> lite_with_targets{
std::map<std::string, bool> with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kCUDA", valid_places_has_target(TARGET(kCUDA))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"];
VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
for (auto& node : graph->StmtTopologicalOrder()) {
auto& inst = node->AsStmt();
const auto* op_info = inst.op_info();
const auto& op_type = op_info->Type();
auto& kernel = inst.picked_kernel();
// The IoCopyOp is a tool operator and does not support type inference.
// On FPGA we have io_copy + cali + layout tool ops, so we need type
// inference for tool operators
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
// for tool operators
if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) {
VLOG(3) << "skip 'io_copy' unless the target is FPGA or OpenCL";
if (op_type == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments.
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
<< node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type, lite_with_targets);
// Inferring the places of the input and output variables according to the
// declaration of the I/O arguments of the op's picked kernel
VLOG(4) << "Op " << op_info->Repr();
for (auto* in_node : node->inlinks) {
auto& var = in_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetInputArgname(var_name, &arg_name))
<< "Cannot find the input argument for var " << var_name;
VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name;
const auto* decl_type = kernel.GetInputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(in_node, *decl_type, with_targets);
} else {
x_in->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_in->AsArg().type->target() == TARGET(kUnk) &&
x_in->AsArg().type->precision() != PRECISION(kUnk) &&
x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_in->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8)) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_in->AsArg().type->precision();
x_in->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
VLOG(4) << "inst " << inst.op_info()->Repr();
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
<< node_name << " in Inst "
<< inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type, lite_with_targets);
for (auto* out_node : node->outlinks) {
auto& var = out_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetOutputArgname(var_name, &arg_name))
<< "Cannot find the output argument for var " << var_name;
VLOG(4) << " - output arg name:" << arg_name
<< " var name:" << var_name;
const auto* decl_type = kernel.GetOutputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(out_node, *decl_type, with_targets);
} else {
x_out->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_out->AsArg().type->target() == TARGET(kUnk) &&
x_out->AsArg().type->precision() != PRECISION(kUnk) &&
x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_out->AsArg().type = type;
} else if (type->precision() == PRECISION(kFP16) &&
type->target() != TARGET(kOpenCL)) {
x_out->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8) ||
(decl_type->precision() == PRECISION(kFP16) &&
decl_type->target() != TARGET(kOpenCL))) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_out->AsArg().type->precision();
x_out->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
}
}
// Update me's kUnk fields by other's fields.
void UpdatePlace(Place* me, const Place& other) {
CHECK(other.is_valid());
if (me->target == TARGET(kUnk)) {
me->target = other.target;
}
if (me->precision == PRECISION(kUnk)) {
me->precision = other.precision;
}
if (me->layout == DATALAYOUT(kUnk)) {
me->layout = other.layout;
}
}
private:
// The default target for arguments, e.g. load weights to CPU memory for CUDA
// computation by default.
// The default target for arguments, e.g. load weights to CPU memory for
// CUDA computation by default.
TargetType argument_default_target_{TARGET(kHost)};
};
......
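`UpdateTypeFrom` above fills only the unknown (`kUnk`) fields of a type from the kernel's declared type and leaves the known fields alone, which is why a quantized `kInt8` precision survives while target and layout are filled in. Here is that merge rule on a toy type struct; the enums are simplified stand-ins, and the real code additionally distinguishes tensors from tensor lists:

```cpp
#include <iostream>

enum class Target { kUnk, kHost, kARM };
enum class Precision { kUnk, kFloat, kInt8 };
enum class Layout { kUnk, kNCHW };

struct Ty { Target target; Precision precision; Layout layout; };

// Fill a's kUnk fields from b; keep everything a already knows.
void UpdateTypeFrom(Ty* a, const Ty& b) {
  if (a->target == Target::kUnk) a->target = b.target;
  if (a->precision == Precision::kUnk) a->precision = b.precision;
  if (a->layout == Layout::kUnk) a->layout = b.layout;
}

int main() {
  Ty var{Target::kUnk, Precision::kInt8, Layout::kUnk};     // partly known
  Ty decl{Target::kARM, Precision::kFloat, Layout::kNCHW};  // kernel decl
  UpdateTypeFrom(&var, decl);
  // Target and layout come from the declaration; the known kInt8 survives.
  std::cout << (var.precision == Precision::kInt8) << "\n";  // 1
  std::cout << (var.target == Target::kARM) << "\n";         // 1
}
```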
......@@ -99,7 +99,7 @@ class OpLite : public Registry {
std::vector<std::unique_ptr<KernelBase>> CreateKernels(
const std::vector<Place> &places, const std::string &kernel_type = "");
lite::Scope *scope() { return scope_; }
Scope *scope() { return scope_; }
// Assign op param to kernel.
virtual void AttachKernel(KernelBase *kernel) = 0;
......@@ -169,7 +169,7 @@ class OpLite : public Registry {
}
protected:
lite::Scope *scope_{nullptr};
Scope *scope_{nullptr};
std::unique_ptr<KernelBase> kernel_;
std::string op_type_;
std::vector<Place> valid_places_;
......
......@@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include "lite/core/mir/generate_program_pass.h"
#include "lite/core/mir/pass_manager.h"
#include "lite/core/mir/pass_utils.h"
......@@ -36,6 +37,9 @@ namespace lite {
* lite::Optimizer optimizes a program. It utilizes the MIR passes to analyze
* the program and exports an optimized program.
*/
// TODO(hong1986032) Support the following passes for the subblocks
const std::set<std::string> kSubblockUnsupportedPasses(
{"memory_optimize_pass"});
class Optimizer {
public:
Optimizer() {}
......@@ -60,14 +64,20 @@ class Optimizer {
program_ = &program;
valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found";
graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places);
CHECK(graphs_.empty()) << "duplicate optimize found";
auto block_size = program.block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
std::unique_ptr<mir::SSAGraph> graph;
graph.reset(new mir::SSAGraph);
graph->Build(program, valid_places, block_idx);
graph->SetValidPlaces(valid_places);
graphs_.emplace_back(std::move(graph));
}
SpecifyKernelPickTactic(kernel_pick_factor);
InitTargetTypeTransformPass();
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
......@@ -107,11 +117,13 @@ class Optimizer {
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
......@@ -176,62 +188,15 @@ class Optimizer {
exec_scope_ = program.exec_scope();
}
const lite::Scope* exec_scope() const { return exec_scope_; }
// Set the shape (dims) info of var descs on the scope vars, so that
// developers can write passes using the input/output tensor dims of ops.
//
// Example: If you have node `Node* softmax_node`,
// you can get dims of output tensor in passes:
//
// auto* scope = softmax_node->AsStmt().op()->scope();
// auto softmax_out_arg_name =
// softmax_node->outlinks.front()->AsArg().name;
// auto softmax_out_tensor =
// scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
// softmax_out_dims = softmax_out_tensor.dims();
void SetVarDescShapeToScopeVar() {
auto dims_to_str_func = [](std::vector<int64_t> shape) -> std::string {
std::string str_res;
for (size_t i = 0; i < shape.size(); ++i) {
str_res += std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_res += "x";
}
}
return str_res;
};
auto* program_desc = program_->program_desc();
VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize();
auto blocks_desc = program_desc->GetBlocks();
for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) {
auto block_desc = blocks_desc[bidx];
auto vars_desc = block_desc.GetVars();
for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) {
auto var_desc = vars_desc[vidx];
VLOG(5) << var_desc.Name() << " "
<< dims_to_str_func(var_desc.GetShape());
if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue;
auto* var = program_->exec_scope()->FindVar(var_desc.Name());
auto tensor = var->GetMutable<lite::Tensor>();
if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) {
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape());
tensor->Resize(var_desc.GetShape());
}
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape())
<< " tensor:" << tensor->dims();
}
}
}
const Scope* exec_scope() const { return exec_scope_; }
// Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
"generate_program_pass");
pass->Apply(graph_);
for (auto& graph : graphs_) {
pass->Apply(graph);
}
auto program = pass->GenProgram();
CHECK(exec_scope_);
program->set_exec_scope(exec_scope_);
......@@ -247,27 +212,38 @@ class Optimizer {
pass->SetValidPlaces(valid_places_);
}
void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() {
auto* pass =
mir::PassManager::Global()
.LookUp<mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass>(
"control_flow_op_unused_inputs_and_outputs_eliminate_pass");
CHECK(pass);
CHECK(!graphs_.empty());
pass->SetAllGraphs(&graphs_);
}
// Generate C++ code which combines the inference program, model and weights.
void GenCode(const std::string& code_dir);
const mir::SSAGraph& ssa_graph() const {
CHECK(graph_);
return *graph_;
const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return *graphs_[block_idx];
}
mir::SSAGraph* mutable_ssa_graph() {
CHECK(graph_);
return graph_.get();
mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return graphs_[block_idx].get();
}
lite::Scope* exec_scope() { return exec_scope_; }
Scope* exec_scope() { return exec_scope_; }
protected:
void SpecifyKernelPickTactic(core::KernelPickFactor factor);
// Specify the passes and run them.
void RunPasses(const std::vector<std::string>& passes) {
SetVarDescShapeToScopeVar();
for (auto& x : passes) {
LOG(INFO) << "== Running pass: " << x;
mir::Pass* pass = mir::PassManager::Global().LookUp(x);
......@@ -285,16 +261,23 @@ class Optimizer {
LOG(INFO) << " - Skip " << x
<< " because the target or kernel does not match.";
} else {
pass->Apply(graph_);
// Check whether the pass supports processing sub-blocks
if (kSubblockUnsupportedPasses.count(x)) {
pass->Apply(graphs_[kRootBlockIdx]);
} else {
for (auto& graph : graphs_) {
pass->Apply(graph);
}
}
LOG(INFO) << "== Finished running: " << x;
}
}
}
private:
std::unique_ptr<mir::SSAGraph> graph_;
std::vector<std::unique_ptr<mir::SSAGraph>> graphs_;
std::vector<Place> valid_places_;
lite::Scope* exec_scope_{};
Scope* exec_scope_{};
Program* program_{};
};
......
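With one SSA graph per block, a pass either runs on every graph or, if listed in `kSubblockUnsupportedPasses`, only on the root graph. A toy driver for that dispatch; the names mirror the header above, but `Graph` and `Apply` are stand-ins for `mir::SSAGraph` and `mir::Pass::Apply`:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Graph { int block_idx; };  // stand-in for mir::SSAGraph

constexpr int kRootBlockIdx = 0;
const std::set<std::string> kSubblockUnsupportedPasses{"memory_optimize_pass"};

void Apply(const std::string& pass, Graph& g) {
  std::cout << pass << " on block " << g.block_idx << "\n";
}

int main() {
  std::vector<Graph> graphs = {{0}, {1}, {2}};  // root + two sub-blocks
  for (const std::string pass :
       {std::string("static_kernel_pick_pass"),
        std::string("memory_optimize_pass")}) {
    if (kSubblockUnsupportedPasses.count(pass)) {
      Apply(pass, graphs[kRootBlockIdx]);  // root block only
    } else {
      for (auto& g : graphs) Apply(pass, g);  // all blocks
    }
  }
}
```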
(This diff is collapsed.)
......@@ -41,61 +41,72 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__";
// - scope: which contains all the weights
struct Program {
public:
explicit Program(const std::shared_ptr<Scope>& root) { scope_ = root; }
Program(const cpp::ProgramDesc& desc,
const std::shared_ptr<Scope>& root,
explicit Program(const std::shared_ptr<Scope>& root_scope) {
scope_ = root_scope;
}
Program(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: scope_(root), valid_places_(valid_places) {
desc_.CopyFrom(desc);
const std::vector<std::string>& vars_to_clone = {})
: scope_(root_scope),
valid_places_(valid_places),
program_desc_(program_desc) {
CHECK(scope_) << "scope should be initialized first";
VLOG(4) << "prepare work";
PrepareWorkspace(desc, var_names);
PrepareWorkspace(program_desc_, vars_to_clone);
VLOG(4) << "build desc";
Build(desc);
Build(program_desc_);
VLOG(4) << "build desc finished";
}
std::unique_ptr<Program> Clone() const {
std::unique_ptr<Program> res(new Program(desc_, scope_, valid_places_));
return res;
return std::unique_ptr<Program>(
new Program(program_desc_, scope_, valid_places_));
}
const std::list<std::string>& weights() const { return weights_; }
const std::list<std::string>& tmp_vars() const { return tmp_vars_; }
const std::list<std::string>& vars() const { return vars_; }
std::list<std::string>* mutable_weights() { return &weights_; }
std::list<std::string>* mutable_tmp_vars() { return &tmp_vars_; }
std::list<std::string>* mutable_vars() { return &vars_; }
const std::list<std::shared_ptr<OpLite>>& ops() const { return ops_; }
std::list<std::shared_ptr<OpLite>>* mutable_ops() { return &ops_; }
const std::list<std::shared_ptr<OpLite>>& ops(
int block_idx = kRootBlockIdx) const {
return ops_[block_idx];
}
std::list<std::shared_ptr<OpLite>>* mutable_ops(
int block_idx = kRootBlockIdx) {
return &ops_[block_idx];
}
lite::Scope* exec_scope() { return exec_scope_; }
lite::Scope* scope() { return scope_.get(); }
size_t block_size() { return ops_.size(); }
cpp::ProgramDesc* program_desc() { return &desc_; }
Scope* exec_scope() { return exec_scope_; }
Scope* scope() { return scope_.get(); }
const std::map<std::string, PrecisionType>& var_data_type() const {
return var_data_type_;
cpp::ProgramDesc* program_desc() { return program_desc_.get(); }
const std::map<std::string, const Type*>& var_type_map() const {
return var_type_map_;
}
private:
// Build from a program and scope.
void Build(const cpp::ProgramDesc& program);
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc);
// Create temporary variables.
void PrepareWorkspace(const cpp::ProgramDesc& program,
const std::vector<std::string>& var_names = {});
void PrepareWorkspace(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<std::string>& vars_to_clone = {});
private:
std::map<std::string, PrecisionType> var_data_type_;
std::list<std::string> tmp_vars_;
std::map<std::string, const Type*> var_type_map_;
std::list<std::string> vars_;
std::list<std::string> weights_;
std::list<std::shared_ptr<OpLite>> ops_;
std::vector<std::list<std::shared_ptr<OpLite>>> ops_;
// the scope to run the kernels, NOTE this is the execution scope.
std::shared_ptr<lite::Scope> scope_;
std::shared_ptr<Scope> scope_;
std::vector<Place> valid_places_;
// Runtime scope.
lite::Scope* exec_scope_{};
cpp::ProgramDesc desc_;
Scope* exec_scope_{};
std::shared_ptr<cpp::ProgramDesc> program_desc_;
};
struct Instruction {
......@@ -173,8 +184,22 @@ struct Instruction {
*/
class LITE_API RuntimeProgram {
public:
explicit RuntimeProgram(std::vector<Instruction>&& insts)
explicit RuntimeProgram(std::vector<std::vector<Instruction>>&& insts)
: instructions_(std::move(insts)) {
Init();
}
explicit RuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
int block_idx = kRootBlockIdx);
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
#endif // LITE_WITH_PROFILE
}
void Init() {
if (instructions_.empty()) {
LOG(FATAL) << "no instructions";
}
......@@ -183,7 +208,7 @@ class LITE_API RuntimeProgram {
#endif
#ifdef LITE_WITH_NVTX
const NVTXAnnotator& annotator = NVTXAnnotator::Global();
for (auto& inst : instructions_) {
for (auto& inst : instructions_[kRootBlockIdx]) {
NVTXRangeAnnotation annotation = annotator.AnnotateBlock();
register_layer_names_.push_back(annotator.RegisterString(
const_cast<paddle::lite::OpLite*>(inst.op())->Type().c_str()));
......@@ -191,41 +216,38 @@ class LITE_API RuntimeProgram {
register_layer_names_.push_back(annotator.RegisterString("one_loop"));
#endif
}
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
#endif // LITE_WITH_PROFILE
}
void Run();
void set_exec_scope(lite::Scope* x) { exec_scope_ = x; }
lite::Scope* exec_scope() { return exec_scope_; }
void set_exec_scope(Scope* x) { exec_scope_ = x; }
Scope* exec_scope() { return exec_scope_; }
size_t num_instructions() const { return instructions_.size(); }
const std::vector<Instruction>& instructions(
int block_idx = kRootBlockIdx) const {
return instructions_[block_idx];
}
const std::vector<Instruction>& instructions() const { return instructions_; }
std::vector<Instruction>* mutable_instructions(
int block_idx = kRootBlockIdx) {
return &instructions_[block_idx];
}
// `SaveOpInfosToProgram` will update the op list(ops_) of the block 0
// in ProgramDesc.
void SaveOpInfosToProgram(cpp::ProgramDesc* desc);
size_t block_size() { return instructions_.size(); }
// `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in
// ProgramDesc. Namely, if a new var created in some passes, its var_desc will
// be added in vars_.
void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
// Update the ops and vars of all of the blocks in the given program_desc
// according to the instructions
void SaveToProgram(std::shared_ptr<cpp::ProgramDesc> program_desc);
private:
RuntimeProgram(const RuntimeProgram&) = delete;
std::vector<Instruction> instructions_;
lite::Scope* exec_scope_{};
std::vector<std::vector<Instruction>> instructions_;
Scope* exec_scope_{};
#ifdef LITE_WITH_PROFILE
profile::Profiler profiler_;
void set_profiler() {
for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
i->set_profiler(&profiler_);
for (auto& inst : instructions_[kRootBlockIdx]) {
inst.set_profiler(&profiler_);
}
}
#endif
......
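`RuntimeProgram` now stores `std::vector<std::vector<Instruction>>`, and its accessors take a `block_idx` that defaults to `kRootBlockIdx`, so existing single-block call sites compile unchanged while sub-blocks become reachable. A minimal sketch of that API shape with toy types (not the real class):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

constexpr int kRootBlockIdx = 0;
struct Instruction { std::string op_type; };  // toy stand-in

class RuntimeProgram {
 public:
  explicit RuntimeProgram(std::vector<std::vector<Instruction>>&& insts)
      : instructions_(std::move(insts)) {}
  // Default to the root block so old single-block callers are unchanged.
  const std::vector<Instruction>& instructions(
      int block_idx = kRootBlockIdx) const {
    return instructions_[block_idx];
  }
  std::size_t block_size() const { return instructions_.size(); }

 private:
  std::vector<std::vector<Instruction>> instructions_;
};

int main() {
  RuntimeProgram program({{{"feed"}, {"while"}, {"fetch"}},  // block 0
                          {{"conv2d"}, {"relu"}}});          // block 1
  std::cout << program.block_size() << " blocks, root op 1: "
            << program.instructions()[1].op_type << "\n";  // "while"
}
```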
......@@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
lod_ = other.lod_;
memory_size_ = other.memory_size_;
precision_ = other.precision_;
persistable_ = other.persistable_;
buffer_->CopyDataFrom(*other.buffer_, memory_size_);
}
......
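The one-line change above makes `CopyDataFrom` carry the `persistable_` flag along with the other metadata; previously a copied tensor silently lost it. A toy illustration of the failure mode and the fix, using a simplified `TensorLite` stand-in:

```cpp
#include <cassert>
#include <cstddef>

struct TensorLite {  // minimal stand-in for lite::TensorLite
  std::size_t memory_size = 0;
  bool persistable = false;
  void CopyDataFrom(const TensorLite& other) {
    memory_size = other.memory_size;
    persistable = other.persistable;  // the added line: keep the flag
    // (the real code also copies dims/lod/precision and the buffer contents)
  }
};

int main() {
  TensorLite weight;
  weight.memory_size = 1024;
  weight.persistable = true;  // e.g. a weight tensor kept across runs
  TensorLite copy;
  copy.CopyDataFrom(weight);
  assert(copy.persistable);  // would fail without the persistable_ copy
}
```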
......@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......
......@@ -14,3 +14,4 @@ add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -37,7 +37,7 @@ bool SubgraphEngine::BuildDeviceProgram() {
subgraph::apu::Graph graph;
int neuron_errCode = NeuronModel_create(&model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create model";
LOG(WARNING) << "[APU] Failed to create the neuron model!";
return false;
}
graph.set_model(model_);
......@@ -46,11 +46,12 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Convert all of the ops with their input vars and weights, and add them
// into the APU NIR graph
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -70,55 +71,38 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
}
// Get input tensor
std::vector<uint32_t> ins;
origin_itensors_.resize(input_names_.size());
origin_idims_.resize(input_names_.size());
// Get the index of input tensors
std::vector<uint32_t> input_indices;
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":"
<< origin_idims_[i].production();
// Get input index
int idx;
if (graph.Has(input_names_[i])) {
ins.push_back(graph.Get(input_names_[i])->index());
VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find input: " << input_names_[i];
return false;
}
CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node "
<< input_names_[i];
auto index = graph.Get(input_names_[i])->index();
input_indices.push_back(index);
VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims "
<< origin_itensors_[i]->dims() << " index " << index;
}
// Get output tensor
std::vector<uint32_t> outs;
origin_otensors_.resize(output_names_.size());
origin_odims_.resize(output_names_.size());
// Get the index of output tensors
std::vector<uint32_t> output_indices;
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":"
<< origin_odims_[i].production();
CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node "
<< output_names_[i];
origin_otensors_[i]->mutable_data<int8_t>();
// Get output index
if (graph.Has(output_names_[i])) {
outs.push_back(graph.Get(output_names_[i])->index());
VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find output: " << output_names_[i];
return false;
}
auto index = graph.Get(output_names_[i])->index();
output_indices.push_back(index);
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
<< origin_otensors_[i]->dims() << " index " << index;
}
VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size();
// Set subgraph input/output
NeuronModel_identifyInputsAndOutputs(
model_, ins.size(), &ins[0], outs.size(), &outs[0]);
// Identify the input and output tensors of the neuron model
NeuronModel_identifyInputsAndOutputs(model_,
input_indices.size(),
&input_indices[0],
output_indices.size(),
&output_indices[0]);
neuron_errCode = NeuronModel_finish(model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
return false;
}
VLOG(3) << "[APU] APU NIR model created!";
......@@ -207,11 +191,11 @@ SubgraphEngine::~SubgraphEngine() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
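The rewritten loops above replace the silent `return false` with a hard `CHECK` and gather tensor indices for `NeuronModel_identifyInputsAndOutputs`. A generic sketch of that gather step, using a plain map as a toy stand-in for `subgraph::apu::Graph` and an `assert` in place of `CHECK`; the Neuron API itself is not modeled here:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Toy graph: var name -> node index assigned during bridge conversion.
  std::map<std::string, uint32_t> graph = {{"x", 0}, {"w", 1}, {"y", 2}};
  std::vector<std::string> input_names = {"x", "w"};
  std::vector<uint32_t> input_indices;
  for (std::size_t i = 0; i < input_names.size(); ++i) {
    // Mirrors CHECK(graph.Has(...)): a missing node is a bug in graph
    // construction, so fail loudly instead of returning false.
    assert(graph.count(input_names[i]) && "Failed to find input node");
    uint32_t index = graph.at(input_names[i]);
    input_indices.push_back(index);
    std::cout << "Input[" << i << "] " << input_names[i] << " index "
              << index << "\n";
  }
  // input_indices would then be handed to the device API in one call.
}
```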
......@@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
~SubgraphEngine();
......
......@@ -75,7 +75,6 @@ add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_comp
add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -87,7 +86,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -202,17 +202,13 @@ void ElementwiseMulCompute<T, PType>::Run() {
}
}
template <>
void ElementwiseMulCompute<int64_t, PRECISION(kInt64)>::Run() {
auto& param = this->template Param<operators::ElementwiseParam>();
lite::arm::math::elementwise_compute_basic<int64_t>(param, "mul", "");
}
void ElementwiseMulActivationCompute::Run() {
auto& param = Param<operators::FusionElementwiseActivationParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
float* out_data = param.Out->mutable_data<float>();
template <typename T, PrecisionType PType>
void ElementwiseMulActivationCompute<T, PType>::Run() {
auto& param =
this->template Param<operators::FusionElementwiseActivationParam>();
auto* x_data = param.X->template data<T>();
auto* y_data = param.Y->template data<T>();
auto* out_data = param.Out->template mutable_data<T>();
int axis = param.axis;
std::string act_type = param.act_type;
auto x_dims = param.X->dims();
......@@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() {
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast<float>(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
y_data, x_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
x_data, y_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu(
lite::arm::math::elementwise_mul_relu<T>(
x_data, y_data, out_data, x_dims.production());
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
......@@ -426,46 +422,60 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_float =
using elementwise_mul_float_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def)
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_int32 =
using elementwise_mul_int32_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def)
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.Finalize();
using elementwise_mul_int64 =
using elementwise_mul_int64_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def)
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(
fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMulActivationCompute,
def)
using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
fusion_elementwise_mul_activation_float_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kInt64,
kNCHW,
fusion_elementwise_mul_activation_int64_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max,
kARM,
kFloat,
......@@ -489,22 +499,22 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_fp32 =
using elementwise_div_fp32_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def)
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_int64 =
using elementwise_div_int64_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def)
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
......@@ -522,11 +532,11 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mod_int64 =
using elementwise_mod_int64_t =
paddle::lite::kernels::arm::ElementwiseModCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def)
elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
......
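The elementwise kernels above moved from a fixed-precision class to a class template over `(T, PType)`, with one `using` alias per instantiation fed to the registration macro. A compilable miniature of that pattern; the toy `Run` body and the alias names here are illustrative, and the real registration goes through `REGISTER_LITE_KERNEL` rather than plain aliases:

```cpp
#include <cstdint>
#include <iostream>

enum class Precision { kFloat, kInt64 };  // stand-in for PrecisionType

template <typename T, Precision PType>
class ElementwiseMulActivationCompute {  // sketch of the templated kernel
 public:
  void Run(const T* x, const T* y, T* out, int n) {
    for (int i = 0; i < n; ++i) {
      T v = x[i] * y[i];
      out[i] = v > T(0) ? v : T(0);  // fused "relu" activation
    }
  }
};

// One alias per instantiation, mirroring
// using fusion_elementwise_mul_activation_float_t = ...<float, kFloat>;
using fusion_mul_act_float_t =
    ElementwiseMulActivationCompute<float, Precision::kFloat>;
using fusion_mul_act_int64_t =
    ElementwiseMulActivationCompute<int64_t, Precision::kInt64>;

int main() {
  float x[] = {1.f, -2.f}, y[] = {3.f, 4.f}, out[2];
  fusion_mul_act_float_t kernel;
  kernel.Run(x, y, out, 2);
  std::cout << out[0] << " " << out[1] << "\n";  // 3 0
}
```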
......@@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite<TARGET(kARM), PType> {
virtual ~ElementwiseMulCompute() = default;
};
class ElementwiseMulActivationCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ElementwiseMulActivationCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
......
......@@ -533,13 +533,15 @@ TEST(fusion_elementwise_mul_activation_arm, retrive_op) {
}
TEST(fusion_elementwise_mul_activation_arm, init) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat));
ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM));
}
TEST(fusion_elementwise_mul_activation_arm, compute) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
......
(55 file diffs collapsed.)